(Stephane Marchesin, me) add hyperz support to radeon drm. Only fast z
authorRoland Scheidegger <rscheidegger_lists@hispeed.ch>
Wed, 8 Dec 2004 16:43:00 +0000 (16:43 +0000)
committerRoland Scheidegger <rscheidegger_lists@hispeed.ch>
Wed, 8 Dec 2004 16:43:00 +0000 (16:43 +0000)
    clear and z buffer compression are working correctly, hierarchical-z is
    not.

shared-core/radeon_cp.c
shared-core/radeon_drm.h
shared-core/radeon_drv.h
shared-core/radeon_state.c
shared/radeon.h
shared/radeon_cp.c
shared/radeon_drm.h
shared/radeon_drv.h
shared/radeon_state.c

index ea061eb..3b3604b 100644 (file)
@@ -2007,6 +2007,18 @@ int radeon_preinit(struct drm_device *dev, unsigned long flags)
        dev->dev_private = (void *)dev_priv;
        dev_priv->flags = flags;
 
+       switch (flags & CHIP_FAMILY_MASK) {
+       case CHIP_R100:
+       case CHIP_RV200:
+       case CHIP_R200:
+       case CHIP_R300:
+               dev_priv->flags |= CHIP_HAS_HIERZ;
+               break;
+       default:
+       /* all other chips have no hierarchical z buffer */
+               break;
+       }
+
 #ifdef __linux__
        /* registers */
        if ((ret = drm_initmap(dev, pci_resource_start(dev->pdev, 2),
index 7c60093..78c3e61 100644 (file)
 #define RADEON_EMIT_PP_TEX_SIZE_1                   74
 #define RADEON_EMIT_PP_TEX_SIZE_2                   75
 #define R200_EMIT_RB3D_BLENDCOLOR                   76
-#define RADEON_MAX_STATE_PACKETS                    77
+#define R200_EMIT_TCL_POINT_SPRITE_CNTL             77
+#define RADEON_MAX_STATE_PACKETS                    78
 
 /* Commands understood by cmd_buffer ioctl.  More can be added but
  * obviously these can't be removed or changed:
@@ -189,6 +190,9 @@ typedef union {
 #define RADEON_BACK                    0x2
 #define RADEON_DEPTH                   0x4
 #define RADEON_STENCIL                  0x8
+#define RADEON_CLEAR_FASTZ             0x80000000
+#define RADEON_USE_HIERZ               0x40000000
+#define RADEON_USE_COMP_ZBUF           0x20000000
 
 /* Primitive types
  */
index 617a7ed..cd75bc1 100644 (file)
@@ -42,7 +42,7 @@
 
 #define DRIVER_NAME            "radeon"
 #define DRIVER_DESC            "ATI Radeon"
-#define DRIVER_DATE            "20020828"
+#define DRIVER_DATE            "20041207"
 
 /* Interface history:
  *
  *       and GL_EXT_blend_[func|equation]_separate on r200
  * 1.12- Add R300 CP microcode support - this just loads the CP on r300
  *       (No 3D support yet - just microcode loading).
+ * 1.13- Add packet R200_EMIT_TCL_POINT_SPRITE_CNTL for ARB_point_parameters
+ *     - Add hyperz support, add hyperz flags to clear ioctl.
  */
 
 #define DRIVER_MAJOR           1
-#define DRIVER_MINOR           12
+#define DRIVER_MINOR           13
 #define DRIVER_PATCHLEVEL      0
 
 enum radeon_family {
@@ -117,6 +119,7 @@ enum radeon_chip_flags {
        CHIP_IS_IGP = 0x00020000UL,
        CHIP_SINGLE_CRTC = 0x00040000UL,
        CHIP_IS_AGP = 0x00080000UL,
+       CHIP_HAS_HIERZ = 0x00100000UL, 
 };
 
 #define GET_RING_HEAD(dev_priv)                DRM_READ32(  (dev_priv)->ring_rptr, 0 )
@@ -466,6 +469,7 @@ extern void radeon_driver_free_filp_priv(drm_device_t * dev,
 #      define RADEON_STENCIL_ENABLE            (1 << 7)
 #      define RADEON_Z_ENABLE                  (1 << 8)
 #define RADEON_RB3D_DEPTHOFFSET                0x1c24
+#define RADEON_RB3D_DEPTHCLEARVALUE    0x3230
 #define RADEON_RB3D_DEPTHPITCH         0x1c28
 #define RADEON_RB3D_PLANEMASK          0x1d84
 #define RADEON_RB3D_STENCILREFMASK     0x1d7c
@@ -478,11 +482,15 @@ extern void radeon_driver_free_filp_priv(drm_device_t * dev,
 #define RADEON_RB3D_ZSTENCILCNTL       0x1c2c
 #      define RADEON_Z_TEST_MASK               (7 << 4)
 #      define RADEON_Z_TEST_ALWAYS             (7 << 4)
+#      define RADEON_Z_HIERARCHY_ENABLE        (1 << 8)
 #      define RADEON_STENCIL_TEST_ALWAYS       (7 << 12)
 #      define RADEON_STENCIL_S_FAIL_REPLACE    (2 << 16)
 #      define RADEON_STENCIL_ZPASS_REPLACE     (2 << 20)
 #      define RADEON_STENCIL_ZFAIL_REPLACE     (2 << 24)
+#      define RADEON_Z_COMPRESSION_ENABLE      (1 << 28)
+#      define RADEON_FORCE_Z_DIRTY             (1 << 29)
 #      define RADEON_Z_WRITE_ENABLE            (1 << 30)
+#      define RADEON_Z_DECOMPRESSION_ENABLE    (1 << 31)
 #define RADEON_RBBM_SOFT_RESET         0x00f0
 #      define RADEON_SOFT_RESET_CP             (1 <<  0)
 #      define RADEON_SOFT_RESET_HI             (1 <<  1)
@@ -590,7 +598,7 @@ extern void radeon_driver_free_filp_priv(drm_device_t * dev,
 #      define RADEON_WAIT_3D_IDLECLEAN         (1 << 17)
 #      define RADEON_WAIT_HOST_IDLECLEAN       (1 << 18)
 
-#define RADEON_RB3D_ZMASKOFFSET                0x1c34
+#define RADEON_RB3D_ZMASKOFFSET                0x3234
 #define RADEON_RB3D_ZSTENCILCNTL       0x1c2c
 #      define RADEON_DEPTH_FORMAT_16BIT_INT_Z  (0 << 0)
 #      define RADEON_DEPTH_FORMAT_24BIT_INT_Z  (2 << 0)
@@ -644,6 +652,8 @@ extern void radeon_driver_free_filp_priv(drm_device_t * dev,
 #      define RADEON_3D_DRAW_IMMD              0x00002900
 #      define RADEON_3D_DRAW_INDX              0x00002A00
 #      define RADEON_3D_LOAD_VBPNTR            0x00002F00
+#      define RADEON_3D_CLEAR_ZMASK            0x00003200
+#      define RADEON_3D_CLEAR_HIZ              0x00003700
 #      define RADEON_CNTL_HOSTDATA_BLT         0x00009400
 #      define RADEON_CNTL_PAINT_MULTI          0x00009A00
 #      define RADEON_CNTL_BITBLT_MULTI         0x00009B00
@@ -801,6 +811,8 @@ extern void radeon_driver_free_filp_priv(drm_device_t * dev,
 
 #define R200_RB3D_BLENDCOLOR              0x3218
 
+#define R200_SE_TCL_POINT_SPRITE_CNTL     0x22c4
+
 /* Constants */
 #define RADEON_MAX_USEC_TIMEOUT                100000  /* 100 ms */
 
index ec85efa..5ec9b35 100644 (file)
@@ -271,6 +271,7 @@ static __inline__ int radeon_check_and_fixup_packets(drm_radeon_private_t *
        case RADEON_EMIT_PP_TEX_SIZE_1:
        case RADEON_EMIT_PP_TEX_SIZE_2:
        case R200_EMIT_RB3D_BLENDCOLOR:
+       case R200_EMIT_TCL_POINT_SPRITE_CNTL:
                /* These packets don't contain memory offsets */
                break;
 
@@ -646,7 +647,9 @@ static struct {
        RADEON_PP_TEX_SIZE_0, 2, "RADEON_PP_TEX_SIZE_0"}, {
        RADEON_PP_TEX_SIZE_1, 2, "RADEON_PP_TEX_SIZE_1"}, {
        RADEON_PP_TEX_SIZE_2, 2, "RADEON_PP_TEX_SIZE_2"}, {
-R200_RB3D_BLENDCOLOR, 3, "R200_RB3D_BLENDCOLOR"},};
+       R200_RB3D_BLENDCOLOR, 3, "R200_RB3D_BLENDCOLOR"}, {
+       R200_SE_TCL_POINT_SPRITE_CNTL, 1, "R200_SE_TCL_POINT_SPRITE_CNTL"},
+};
 
 /* ================================================================
  * Performance monitoring functions
@@ -858,11 +861,159 @@ static void radeon_cp_dispatch_clear(drm_device_t * dev,
                }
        }
 
+       /* hyper z clear */
+       /* no docs available, based on reverse engineering by Stephane Marchesin */
+       if ((flags & (RADEON_DEPTH | RADEON_STENCIL)) && (flags & RADEON_CLEAR_FASTZ)) {
+
+               int i;
+               int depthpixperline = dev_priv->depth_fmt==RADEON_DEPTH_FORMAT_16BIT_INT_Z? 
+                       (dev_priv->depth_pitch / 2): (dev_priv->depth_pitch / 4);
+               
+               u32 clearmask;
+
+               u32 tempRB3D_DEPTHCLEARVALUE = clear->clear_depth |
+                       ((clear->depth_mask & 0xff) << 24);
+       
+               
+               /* Make sure we restore the 3D state next time.
+                * we haven't touched any "normal" state - still need this?
+                */
+               dev_priv->sarea_priv->ctx_owner = 0;
+
+               if ((dev_priv->flags & CHIP_HAS_HIERZ) && (flags & RADEON_USE_HIERZ)) {
+               /* FIXME : reverse engineer that for Rx00 cards */
+               /* FIXME : the mask supposedly contains low-res z values. So can't set
+                  just to the max (0xff? or actually 0x3fff?), need to take z clear
+                  value into account? */
+               /* pattern seems to work for r100, though get slight
+                  rendering errors with glxgears. If hierz is not enabled for r100,
+                  only 4 bits which indicate clear (15,16,31,32, all zero) matter, the
+                  other ones are ignored, and the same clear mask can be used. That's
+                  very different behaviour than R200 which needs different clear mask
+                  and different number of tiles to clear if hierz is enabled or not !?!
+               */
+                       clearmask = (0xff<<22)|(0xff<<6)| 0x003f003f;
+               }
+               else {
+               /* clear mask : chooses the clearing pattern.
+                  rv250: could be used to clear only parts of macrotiles
+                  (but that would get really complicated...)?
+                  bit 0 and 1 (either or both of them ?!?!) are used to
+                  not clear tile (or maybe one of the bits indicates if the tile is
+                  compressed or not), bit 2 and 3 to not clear tile 1,...,.
+                  Pattern is as follows:
+                       | 0,1 | 4,5 | 8,9 |12,13|16,17|20,21|24,25|28,29|
+                  bits -------------------------------------------------
+                       | 2,3 | 6,7 |10,11|14,15|18,19|22,23|26,27|30,31|
+                  rv100: clearmask covers 2x8 4x1 tiles, but one clear still
+                  covers 256 pixels ?!?
+               */
+                       clearmask = 0x0;
+               }
+
+               BEGIN_RING( 8 );
+               RADEON_WAIT_UNTIL_2D_IDLE();
+               OUT_RING_REG( RADEON_RB3D_DEPTHCLEARVALUE,
+                       tempRB3D_DEPTHCLEARVALUE);
+               /* what offset is this exactly ? */
+               OUT_RING_REG( RADEON_RB3D_ZMASKOFFSET, 0 );
+               /* need ctlstat, otherwise get some strange black flickering */
+               OUT_RING_REG( RADEON_RB3D_ZCACHE_CTLSTAT, RADEON_RB3D_ZC_FLUSH_ALL );
+               ADVANCE_RING();
+
+               for (i = 0; i < nbox; i++) {
+                       int tileoffset, nrtilesx, nrtilesy, j;
+                       /* it looks like r200 needs rv-style clears, at least if hierz is not enabled? */
+                       if ((dev_priv->flags&CHIP_HAS_HIERZ) && !(dev_priv->microcode_version==UCODE_R200)) {
+                               /* FIXME : figure this out for r200 (when hierz is enabled). Or
+                                  maybe r200 actually doesn't need to put the low-res z value into
+                                  the tile cache like r100, but just needs to clear the hi-level z-buffer?
+                                  Works for R100, both with hierz and without.
+                                  R100 seems to operate on 2x1 8x8 tiles, but...
+                                  odd: offset/nrtiles need to be 64 pix (4 block) aligned? Potentially
+                                  problematic with resolutions which are not 64 pix aligned? */
+                               tileoffset = ((pbox[i].y1 >> 3) * depthpixperline + pbox[i].x1) >> 6;
+                               nrtilesx = ((pbox[i].x2 & ~63) - (pbox[i].x1 & ~63)) >> 4;
+                               nrtilesy = (pbox[i].y2 >> 3) - (pbox[i].y1 >> 3);
+                               for (j = 0; j <= nrtilesy; j++) {
+                                       BEGIN_RING( 4 );
+                                       OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_ZMASK, 2 ) );
+                                       /* first tile */
+                                       OUT_RING( tileoffset * 8 );
+                                       /* the number of tiles to clear */
+                                       OUT_RING( nrtilesx + 4 );
+                                       /* clear mask : chooses the clearing pattern. */
+                                       OUT_RING( clearmask );
+                                       ADVANCE_RING();
+                                       tileoffset += depthpixperline >> 6;
+                               }
+                       }
+                       else if (dev_priv->microcode_version==UCODE_R200) {
+                               /* works for rv250. */
+                               /* find first macro tile (8x2 4x4 z-pixels on rv250) */
+                               tileoffset = ((pbox[i].y1 >> 3) * depthpixperline + pbox[i].x1) >> 5;
+                               nrtilesx = (pbox[i].x2 >> 5) - (pbox[i].x1 >> 5);
+                               nrtilesy = (pbox[i].y2 >> 3) - (pbox[i].y1 >> 3);
+                               for (j = 0; j <= nrtilesy; j++) {
+                                       BEGIN_RING( 4 );
+                                       OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_ZMASK, 2 ) );
+                                       /* first tile */
+                                       /* judging by the first tile offset needed, could possibly
+                                          directly address/clear 4x4 tiles instead of 8x2 * 4x4
+                                          macro tiles, though would still need clear mask for
+                                          right/bottom if truly 4x4 granularity is desired ? */
+                                       OUT_RING( tileoffset * 16 );
+                                       /* the number of tiles to clear */
+                                       OUT_RING( nrtilesx + 1 );
+                                       /* clear mask : chooses the clearing pattern. */
+                                       OUT_RING( clearmask );
+                                       ADVANCE_RING();
+                                       tileoffset += depthpixperline >> 5;
+                               }
+                       }
+                       else { /* rv 100 */
+                               /* rv100 might not need 64 pix alignment, who knows */
+                               /* offsets are, hmm, weird */
+                               tileoffset = ((pbox[i].y1 >> 4) * depthpixperline + pbox[i].x1) >> 6;
+                               nrtilesx = ((pbox[i].x2 & ~63) - (pbox[i].x1 & ~63)) >> 4;
+                               nrtilesy = (pbox[i].y2 >> 4) - (pbox[i].y1 >> 4);
+                               for (j = 0; j <= nrtilesy; j++) {
+                                       BEGIN_RING( 4 );
+                                       OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_ZMASK, 2 ) );
+                                       OUT_RING( tileoffset * 128 );
+                                       /* the number of tiles to clear */
+                                       OUT_RING( nrtilesx + 4 );
+                                       /* clear mask : chooses the clearing pattern. */
+                                       OUT_RING( clearmask );
+                                       ADVANCE_RING();
+                                       tileoffset += depthpixperline >> 6;
+                               }
+                       }
+               }
+
+               /* TODO don't always clear all hi-level z tiles */
+               if ((dev_priv->flags & CHIP_HAS_HIERZ) && (dev_priv->microcode_version==UCODE_R200)
+                       && (flags & RADEON_USE_HIERZ))
+               /* r100 and cards without hierarchical z-buffer have no high-level z-buffer */
+               /* FIXME : the mask supposedly contains low-res z values. So can't set
+                  just to the max (0xff? or actually 0x3fff?), need to take z clear
+                  value into account? */
+               {
+                       BEGIN_RING( 4 );
+                       OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_HIZ, 2 ) );
+                       OUT_RING( 0x0 ); /* First tile */
+                       OUT_RING( 0x3cc0 );
+                       OUT_RING( (0xff<<22)|(0xff<<6)| 0x003f003f);
+                       ADVANCE_RING();
+               }
+       }
+
        /* We have to clear the depth and/or stencil buffers by
         * rendering a quad into just those buffers.  Thus, we have to
         * make sure the 3D engine is configured correctly.
         */
-       if ((dev_priv->microcode_version == UCODE_R200) && (flags & (RADEON_DEPTH | RADEON_STENCIL))) {
+       else if ((dev_priv->microcode_version == UCODE_R200) &&
+               (flags & (RADEON_DEPTH | RADEON_STENCIL))) {
 
                int tempPP_CNTL;
                int tempRE_CNTL;
@@ -929,6 +1080,14 @@ static void radeon_cp_dispatch_clear(drm_device_t * dev,
                        tempRB3D_STENCILREFMASK = 0x00000000;
                }
 
+               if (flags & RADEON_USE_COMP_ZBUF) {
+                       tempRB3D_ZSTENCILCNTL |= RADEON_Z_COMPRESSION_ENABLE |
+                               RADEON_Z_DECOMPRESSION_ENABLE;
+               }
+               if (flags & RADEON_USE_HIERZ) {
+                       tempRB3D_ZSTENCILCNTL |= RADEON_Z_HIERARCHY_ENABLE;
+               }
+
                BEGIN_RING(26);
                RADEON_WAIT_UNTIL_2D_IDLE();
 
@@ -979,6 +1138,8 @@ static void radeon_cp_dispatch_clear(drm_device_t * dev,
                }
        } else if ((flags & (RADEON_DEPTH | RADEON_STENCIL))) {
 
+               int tempRB3D_ZSTENCILCNTL = depth_clear->rb3d_zstencilcntl;
+               
                rb3d_cntl = depth_clear->rb3d_cntl;
 
                if (flags & RADEON_DEPTH) {
@@ -995,6 +1156,14 @@ static void radeon_cp_dispatch_clear(drm_device_t * dev,
                        rb3d_stencilrefmask = 0x00000000;
                }
 
+               if (flags & RADEON_USE_COMP_ZBUF) {
+                       tempRB3D_ZSTENCILCNTL |= RADEON_Z_COMPRESSION_ENABLE |
+                               RADEON_Z_DECOMPRESSION_ENABLE;
+               }
+               if (flags & RADEON_USE_HIERZ) {
+                       tempRB3D_ZSTENCILCNTL |= RADEON_Z_HIERARCHY_ENABLE;
+               }
+
                BEGIN_RING(13);
                RADEON_WAIT_UNTIL_2D_IDLE();
 
@@ -1002,8 +1171,7 @@ static void radeon_cp_dispatch_clear(drm_device_t * dev,
                OUT_RING(0x00000000);
                OUT_RING(rb3d_cntl);
 
-               OUT_RING_REG(RADEON_RB3D_ZSTENCILCNTL,
-                            depth_clear->rb3d_zstencilcntl);
+               OUT_RING_REG(RADEON_RB3D_ZSTENCILCNTL, tempRB3D_ZSTENCILCNTL);
                OUT_RING_REG(RADEON_RB3D_STENCILREFMASK, rb3d_stencilrefmask);
                OUT_RING_REG(RADEON_RB3D_PLANEMASK, 0x00000000);
                OUT_RING_REG(RADEON_SE_CNTL, depth_clear->se_cntl);
index 06f5017..80bfa0c 100644 (file)
 
 #define DRIVER_NAME            "radeon"
 #define DRIVER_DESC            "ATI Radeon"
-#define DRIVER_DATE            "20020828"
+#define DRIVER_DATE            "20041207"
 
 #define DRIVER_MAJOR           1
-#define DRIVER_MINOR           12
+#define DRIVER_MINOR           13
 #define DRIVER_PATCHLEVEL      0
 
 /* Interface history:
@@ -82,6 +82,8 @@
  *       and GL_EXT_blend_[func|equation]_separate on r200
  * 1.12- Add R300 CP microcode support - this just loads the CP on r300
  *       (No 3D support yet - just microcode loading).
+ * 1.13- Add packet R200_EMIT_TCL_POINT_SPRITE_CNTL for ARB_point_parameters
+ *     - Add hyperz support, add hyperz flags to clear ioctl.
  */
 #define DRIVER_IOCTLS                                                       \
  [DRM_IOCTL_NR(DRM_IOCTL_DMA)]               = { radeon_cp_buffers,  1, 0 }, \
index 75a7bd5..5d13f47 100644 (file)
@@ -2017,6 +2017,18 @@ int radeon_preinit( struct drm_device *dev, unsigned long flags )
        dev->dev_private = (void *)dev_priv;
        dev_priv->flags = flags;
 
+       switch (flags & CHIP_FAMILY_MASK) {
+       case CHIP_R100:
+       case CHIP_RV200:
+       case CHIP_R200:
+       case CHIP_R300:
+               dev_priv->flags |= CHIP_HAS_HIERZ;
+               break;
+       default:
+       /* all other chips have no hierarchical z buffer */
+               break;
+       }
+
        /* registers */
        if( (ret = DRM(initmap)( dev, pci_resource_start( dev->pdev, 2 ),
                        pci_resource_len( dev->pdev, 2 ), _DRM_REGISTERS, 0 )))
index 14d65ea..e086938 100644 (file)
 #define RADEON_EMIT_PP_TEX_SIZE_1                   74
 #define RADEON_EMIT_PP_TEX_SIZE_2                   75
 #define R200_EMIT_RB3D_BLENDCOLOR                   76
-#define RADEON_MAX_STATE_PACKETS                    77
+#define R200_EMIT_TCL_POINT_SPRITE_CNTL             77
+#define RADEON_MAX_STATE_PACKETS                    78
 
 
 /* Commands understood by cmd_buffer ioctl.  More can be added but
@@ -193,6 +194,9 @@ typedef union {
 #define RADEON_BACK                    0x2
 #define RADEON_DEPTH                   0x4
 #define RADEON_STENCIL                  0x8
+#define RADEON_CLEAR_FASTZ             0x80000000
+#define RADEON_USE_HIERZ               0x40000000
+#define RADEON_USE_COMP_ZBUF           0x20000000
 
 /* Primitive types
  */
index 32a6c3f..9e0e8fe 100644 (file)
@@ -68,6 +68,7 @@ enum radeon_chip_flags {
        CHIP_IS_IGP             = 0x00020000UL,
        CHIP_SINGLE_CRTC        = 0x00040000UL,
        CHIP_IS_AGP             = 0x00080000UL, 
+       CHIP_HAS_HIERZ          = 0x00100000UL, 
 };
 
 #define GET_RING_HEAD(dev_priv)                DRM_READ32(  (dev_priv)->ring_rptr, 0 )
@@ -411,6 +412,7 @@ extern void radeon_driver_irq_uninstall( drm_device_t *dev );
 #      define RADEON_STENCIL_ENABLE            (1 << 7)
 #      define RADEON_Z_ENABLE                  (1 << 8)
 #define RADEON_RB3D_DEPTHOFFSET                0x1c24
+#define RADEON_RB3D_DEPTHCLEARVALUE    0x3230
 #define RADEON_RB3D_DEPTHPITCH         0x1c28
 #define RADEON_RB3D_PLANEMASK          0x1d84
 #define RADEON_RB3D_STENCILREFMASK     0x1d7c
@@ -423,11 +425,15 @@ extern void radeon_driver_irq_uninstall( drm_device_t *dev );
 #define RADEON_RB3D_ZSTENCILCNTL       0x1c2c
 #      define RADEON_Z_TEST_MASK               (7 << 4)
 #      define RADEON_Z_TEST_ALWAYS             (7 << 4)
+#      define RADEON_Z_HIERARCHY_ENABLE        (1 << 8)
 #      define RADEON_STENCIL_TEST_ALWAYS       (7 << 12)
 #      define RADEON_STENCIL_S_FAIL_REPLACE    (2 << 16)
 #      define RADEON_STENCIL_ZPASS_REPLACE     (2 << 20)
 #      define RADEON_STENCIL_ZFAIL_REPLACE     (2 << 24)
+#      define RADEON_Z_COMPRESSION_ENABLE      (1 << 28)
+#      define RADEON_FORCE_Z_DIRTY             (1 << 29)
 #      define RADEON_Z_WRITE_ENABLE            (1 << 30)
+#      define RADEON_Z_DECOMPRESSION_ENABLE    (1 << 31)
 #define RADEON_RBBM_SOFT_RESET         0x00f0
 #      define RADEON_SOFT_RESET_CP             (1 <<  0)
 #      define RADEON_SOFT_RESET_HI             (1 <<  1)
@@ -535,7 +541,7 @@ extern void radeon_driver_irq_uninstall( drm_device_t *dev );
 #      define RADEON_WAIT_3D_IDLECLEAN         (1 << 17)
 #      define RADEON_WAIT_HOST_IDLECLEAN       (1 << 18)
 
-#define RADEON_RB3D_ZMASKOFFSET                0x1c34
+#define RADEON_RB3D_ZMASKOFFSET                0x3234
 #define RADEON_RB3D_ZSTENCILCNTL       0x1c2c
 #      define RADEON_DEPTH_FORMAT_16BIT_INT_Z  (0 << 0)
 #      define RADEON_DEPTH_FORMAT_24BIT_INT_Z  (2 << 0)
@@ -590,6 +596,8 @@ extern void radeon_driver_irq_uninstall( drm_device_t *dev );
 #      define RADEON_3D_DRAW_IMMD              0x00002900
 #      define RADEON_3D_DRAW_INDX              0x00002A00
 #      define RADEON_3D_LOAD_VBPNTR            0x00002F00
+#      define RADEON_3D_CLEAR_ZMASK            0x00003200
+#      define RADEON_3D_CLEAR_HIZ              0x00003700
 #      define RADEON_CNTL_HOSTDATA_BLT         0x00009400
 #      define RADEON_CNTL_PAINT_MULTI          0x00009A00
 #      define RADEON_CNTL_BITBLT_MULTI         0x00009B00
@@ -748,6 +756,8 @@ extern void radeon_driver_irq_uninstall( drm_device_t *dev );
 
 #define R200_RB3D_BLENDCOLOR              0x3218
 
+#define R200_SE_TCL_POINT_SPRITE_CNTL     0x22c4
+
 /* Constants */
 #define RADEON_MAX_USEC_TIMEOUT                100000  /* 100 ms */
 
index 3cafd9a..caba6a3 100644 (file)
@@ -205,6 +205,7 @@ static __inline__ int radeon_check_and_fixup_packets( drm_radeon_private_t *dev_
        case RADEON_EMIT_PP_TEX_SIZE_1:
        case RADEON_EMIT_PP_TEX_SIZE_2:
        case R200_EMIT_RB3D_BLENDCOLOR:
+       case R200_EMIT_TCL_POINT_SPRITE_CNTL:
                /* These packets don't contain memory offsets */
                break;
 
@@ -569,6 +570,7 @@ static struct {
        { RADEON_PP_TEX_SIZE_1, 2, "RADEON_PP_TEX_SIZE_1" },
        { RADEON_PP_TEX_SIZE_2, 2, "RADEON_PP_TEX_SIZE_2" },
        { R200_RB3D_BLENDCOLOR, 3, "R200_RB3D_BLENDCOLOR" },
+       { R200_SE_TCL_POINT_SPRITE_CNTL, 1, "R200_SE_TCL_POINT_SPRITE_CNTL"},
 };
 
 
@@ -780,12 +782,159 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev,
                }
        }
 
+       /* hyper z clear */
+       /* no docs available, based on reverse engineering by Stephane Marchesin */
+       if ((flags & (RADEON_DEPTH | RADEON_STENCIL)) && (flags & RADEON_CLEAR_FASTZ)) {
+
+               int i;
+               int depthpixperline = dev_priv->depth_fmt==RADEON_DEPTH_FORMAT_16BIT_INT_Z? 
+                       (dev_priv->depth_pitch / 2): (dev_priv->depth_pitch / 4);
+               
+               u32 clearmask;
+
+               u32 tempRB3D_DEPTHCLEARVALUE = clear->clear_depth |
+                       ((clear->depth_mask & 0xff) << 24);
+       
+               
+               /* Make sure we restore the 3D state next time.
+                * we haven't touched any "normal" state - still need this?
+                */
+               dev_priv->sarea_priv->ctx_owner = 0;
+
+               if ((dev_priv->flags & CHIP_HAS_HIERZ) && (flags & RADEON_USE_HIERZ)) {
+               /* FIXME : reverse engineer that for Rx00 cards */
+               /* FIXME : the mask supposedly contains low-res z values. So can't set
+                  just to the max (0xff? or actually 0x3fff?), need to take z clear
+                  value into account? */
+               /* pattern seems to work for r100, though get slight
+                  rendering errors with glxgears. If hierz is not enabled for r100,
+                  only 4 bits which indicate clear (15,16,31,32, all zero) matter, the
+                  other ones are ignored, and the same clear mask can be used. That's
+                  very different behaviour than R200 which needs different clear mask
+                  and different number of tiles to clear if hierz is enabled or not !?!
+               */
+                       clearmask = (0xff<<22)|(0xff<<6)| 0x003f003f;
+               }
+               else {
+               /* clear mask : chooses the clearing pattern.
+                  rv250: could be used to clear only parts of macrotiles
+                  (but that would get really complicated...)?
+                  bit 0 and 1 (either or both of them ?!?!) are used to
+                  not clear tile (or maybe one of the bits indicates if the tile is
+                  compressed or not), bit 2 and 3 to not clear tile 1,...,.
+                  Pattern is as follows:
+                       | 0,1 | 4,5 | 8,9 |12,13|16,17|20,21|24,25|28,29|
+                  bits -------------------------------------------------
+                       | 2,3 | 6,7 |10,11|14,15|18,19|22,23|26,27|30,31|
+                  rv100: clearmask covers 2x8 4x1 tiles, but one clear still
+                  covers 256 pixels ?!?
+               */
+                       clearmask = 0x0;
+               }
+
+               BEGIN_RING( 8 );
+               RADEON_WAIT_UNTIL_2D_IDLE();
+               OUT_RING_REG( RADEON_RB3D_DEPTHCLEARVALUE,
+                       tempRB3D_DEPTHCLEARVALUE);
+               /* what offset is this exactly ? */
+               OUT_RING_REG( RADEON_RB3D_ZMASKOFFSET, 0 );
+               /* need ctlstat, otherwise get some strange black flickering */
+               OUT_RING_REG( RADEON_RB3D_ZCACHE_CTLSTAT, RADEON_RB3D_ZC_FLUSH_ALL );
+               ADVANCE_RING();
+
+               for (i = 0; i < nbox; i++) {
+                       int tileoffset, nrtilesx, nrtilesy, j;
+                       /* it looks like r200 needs rv-style clears, at least if hierz is not enabled? */
+                       if ((dev_priv->flags&CHIP_HAS_HIERZ) && !(dev_priv->microcode_version==UCODE_R200)) {
+                               /* FIXME : figure this out for r200 (when hierz is enabled). Or
+                                  maybe r200 actually doesn't need to put the low-res z value into
+                                  the tile cache like r100, but just needs to clear the hi-level z-buffer?
+                                  Works for R100, both with hierz and without.
+                                  R100 seems to operate on 2x1 8x8 tiles, but...
+                                  odd: offset/nrtiles need to be 64 pix (4 block) aligned? Potentially
+                                  problematic with resolutions which are not 64 pix aligned? */
+                               tileoffset = ((pbox[i].y1 >> 3) * depthpixperline + pbox[i].x1) >> 6;
+                               nrtilesx = ((pbox[i].x2 & ~63) - (pbox[i].x1 & ~63)) >> 4;
+                               nrtilesy = (pbox[i].y2 >> 3) - (pbox[i].y1 >> 3);
+                               for (j = 0; j <= nrtilesy; j++) {
+                                       BEGIN_RING( 4 );
+                                       OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_ZMASK, 2 ) );
+                                       /* first tile */
+                                       OUT_RING( tileoffset * 8 );
+                                       /* the number of tiles to clear */
+                                       OUT_RING( nrtilesx + 4 );
+                                       /* clear mask : chooses the clearing pattern. */
+                                       OUT_RING( clearmask );
+                                       ADVANCE_RING();
+                                       tileoffset += depthpixperline >> 6;
+                               }
+                       }
+                       else if (dev_priv->microcode_version==UCODE_R200) {
+                               /* works for rv250. */
+                               /* find first macro tile (8x2 4x4 z-pixels on rv250) */
+                               tileoffset = ((pbox[i].y1 >> 3) * depthpixperline + pbox[i].x1) >> 5;
+                               nrtilesx = (pbox[i].x2 >> 5) - (pbox[i].x1 >> 5);
+                               nrtilesy = (pbox[i].y2 >> 3) - (pbox[i].y1 >> 3);
+                               for (j = 0; j <= nrtilesy; j++) {
+                                       BEGIN_RING( 4 );
+                                       OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_ZMASK, 2 ) );
+                                       /* first tile */
+                                       /* judging by the first tile offset needed, could possibly
+                                          directly address/clear 4x4 tiles instead of 8x2 * 4x4
+                                          macro tiles, though would still need clear mask for
+                                          right/bottom if truly 4x4 granularity is desired ? */
+                                       OUT_RING( tileoffset * 16 );
+                                       /* the number of tiles to clear */
+                                       OUT_RING( nrtilesx + 1 );
+                                       /* clear mask : chooses the clearing pattern. */
+                                       OUT_RING( clearmask );
+                                       ADVANCE_RING();
+                                       tileoffset += depthpixperline >> 5;
+                               }
+                       }
+                       else { /* rv 100 */
+                               /* rv100 might not need 64 pix alignment, who knows */
+                               /* offsets are, hmm, weird */
+                               tileoffset = ((pbox[i].y1 >> 4) * depthpixperline + pbox[i].x1) >> 6;
+                               nrtilesx = ((pbox[i].x2 & ~63) - (pbox[i].x1 & ~63)) >> 4;
+                               nrtilesy = (pbox[i].y2 >> 4) - (pbox[i].y1 >> 4);
+                               for (j = 0; j <= nrtilesy; j++) {
+                                       BEGIN_RING( 4 );
+                                       OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_ZMASK, 2 ) );
+                                       OUT_RING( tileoffset * 128 );
+                                       /* the number of tiles to clear */
+                                       OUT_RING( nrtilesx + 4 );
+                                       /* clear mask : chooses the clearing pattern. */
+                                       OUT_RING( clearmask );
+                                       ADVANCE_RING();
+                                       tileoffset += depthpixperline >> 6;
+                               }
+                       }
+               }
+
+               /* TODO don't always clear all hi-level z tiles */
+               if ((dev_priv->flags & CHIP_HAS_HIERZ) && (dev_priv->microcode_version==UCODE_R200)
+                       && (flags & RADEON_USE_HIERZ))
+               /* r100 and cards without hierarchical z-buffer have no high-level z-buffer */
+               /* FIXME : the mask supposedly contains low-res z values. So can't set
+                  just to the max (0xff? or actually 0x3fff?), need to take z clear
+                  value into account? */
+               {
+                       BEGIN_RING( 4 );
+                       OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_HIZ, 2 ) );
+                       OUT_RING( 0x0 ); /* First tile */
+                       OUT_RING( 0x3cc0 );
+                       OUT_RING( (0xff<<22)|(0xff<<6)| 0x003f003f);
+                       ADVANCE_RING();
+               }
+       }
+
        /* We have to clear the depth and/or stencil buffers by
         * rendering a quad into just those buffers.  Thus, we have to
         * make sure the 3D engine is configured correctly.
         */
-       if ( (dev_priv->microcode_version==UCODE_R200) &&
-            (flags & (RADEON_DEPTH | RADEON_STENCIL)) ) {
+       else if ((dev_priv->microcode_version == UCODE_R200) &&
+               (flags & (RADEON_DEPTH | RADEON_STENCIL))) {
 
                int tempPP_CNTL;
                int tempRE_CNTL;
@@ -855,6 +1004,14 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev,
                        tempRB3D_STENCILREFMASK = 0x00000000;
                }
 
+               if (flags & RADEON_USE_COMP_ZBUF) {
+                       tempRB3D_ZSTENCILCNTL |= RADEON_Z_COMPRESSION_ENABLE |
+                               RADEON_Z_DECOMPRESSION_ENABLE;
+               }
+               if (flags & RADEON_USE_HIERZ) {
+                       tempRB3D_ZSTENCILCNTL |= RADEON_Z_HIERARCHY_ENABLE;
+               }
+
                BEGIN_RING( 26 );
                RADEON_WAIT_UNTIL_2D_IDLE();
 
@@ -909,6 +1066,8 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev,
        } 
        else if ( (flags & (RADEON_DEPTH | RADEON_STENCIL)) ) {
 
+               int tempRB3D_ZSTENCILCNTL = depth_clear->rb3d_zstencilcntl;
+
                rb3d_cntl = depth_clear->rb3d_cntl;
 
                if ( flags & RADEON_DEPTH ) {
@@ -925,6 +1084,14 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev,
                        rb3d_stencilrefmask = 0x00000000;
                }
 
+               if (flags & RADEON_USE_COMP_ZBUF) {
+                       tempRB3D_ZSTENCILCNTL |= RADEON_Z_COMPRESSION_ENABLE |
+                               RADEON_Z_DECOMPRESSION_ENABLE;
+               }
+               if (flags & RADEON_USE_HIERZ) {
+                       tempRB3D_ZSTENCILCNTL |= RADEON_Z_HIERARCHY_ENABLE;
+               }
+
                BEGIN_RING( 13 );
                RADEON_WAIT_UNTIL_2D_IDLE();
 
@@ -933,7 +1100,7 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev,
                OUT_RING( rb3d_cntl );
                
                OUT_RING_REG( RADEON_RB3D_ZSTENCILCNTL,
-                             depth_clear->rb3d_zstencilcntl );
+                             tempRB3D_ZSTENCILCNTL );
                OUT_RING_REG( RADEON_RB3D_STENCILREFMASK,
                              rb3d_stencilrefmask );
                OUT_RING_REG( RADEON_RB3D_PLANEMASK,