From: Roland Scheidegger Date: Wed, 8 Dec 2004 16:43:00 +0000 (+0000) Subject: (Stephane Marchesin, me) add hyperz support to radeon drm. Only fast z X-Git-Tag: libdrm-1_0_0~192 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c4a87c6883ede7bbf486743efe3e9325d96f8e54;p=platform%2Fupstream%2Flibdrm.git (Stephane Marchesin, me) add hyperz support to radeon drm. Only fast z clear and z buffer compression are working correctly, hierarchical-z is not. --- diff --git a/shared-core/radeon_cp.c b/shared-core/radeon_cp.c index ea061eb..3b3604b 100644 --- a/shared-core/radeon_cp.c +++ b/shared-core/radeon_cp.c @@ -2007,6 +2007,18 @@ int radeon_preinit(struct drm_device *dev, unsigned long flags) dev->dev_private = (void *)dev_priv; dev_priv->flags = flags; + switch (flags & CHIP_FAMILY_MASK) { + case CHIP_R100: + case CHIP_RV200: + case CHIP_R200: + case CHIP_R300: + dev_priv->flags |= CHIP_HAS_HIERZ; + break; + default: + /* all other chips have no hierarchical z buffer */ + break; + } + #ifdef __linux__ /* registers */ if ((ret = drm_initmap(dev, pci_resource_start(dev->pdev, 2), diff --git a/shared-core/radeon_drm.h b/shared-core/radeon_drm.h index 7c60093..78c3e61 100644 --- a/shared-core/radeon_drm.h +++ b/shared-core/radeon_drm.h @@ -144,7 +144,8 @@ #define RADEON_EMIT_PP_TEX_SIZE_1 74 #define RADEON_EMIT_PP_TEX_SIZE_2 75 #define R200_EMIT_RB3D_BLENDCOLOR 76 -#define RADEON_MAX_STATE_PACKETS 77 +#define R200_EMIT_TCL_POINT_SPRITE_CNTL 77 +#define RADEON_MAX_STATE_PACKETS 78 /* Commands understood by cmd_buffer ioctl. More can be added but * obviously these can't be removed or changed: @@ -189,6 +190,9 @@ typedef union { #define RADEON_BACK 0x2 #define RADEON_DEPTH 0x4 #define RADEON_STENCIL 0x8 +#define RADEON_CLEAR_FASTZ 0x80000000 +#define RADEON_USE_HIERZ 0x40000000 +#define RADEON_USE_COMP_ZBUF 0x20000000 /* Primitive types */ diff --git a/shared-core/radeon_drv.h b/shared-core/radeon_drv.h index 617a7ed..cd75bc1 100644 --- a/shared-core/radeon_drv.h +++ b/shared-core/radeon_drv.h @@ -42,7 +42,7 @@ #define DRIVER_NAME "radeon" #define DRIVER_DESC "ATI Radeon" -#define DRIVER_DATE "20020828" +#define DRIVER_DATE "20041207" /* Interface history: * @@ -78,10 +78,12 @@ * and GL_EXT_blend_[func|equation]_separate on r200 * 1.12- Add R300 CP microcode support - this just loads the CP on r300 * (No 3D support yet - just microcode loading). + * 1.13- Add packet R200_EMIT_TCL_POINT_SPRITE_CNTL for ARB_point_parameters + * - Add hyperz support, add hyperz flags to clear ioctl. */ #define DRIVER_MAJOR 1 -#define DRIVER_MINOR 12 +#define DRIVER_MINOR 13 #define DRIVER_PATCHLEVEL 0 enum radeon_family { @@ -117,6 +119,7 @@ enum radeon_chip_flags { CHIP_IS_IGP = 0x00020000UL, CHIP_SINGLE_CRTC = 0x00040000UL, CHIP_IS_AGP = 0x00080000UL, + CHIP_HAS_HIERZ = 0x00100000UL, }; #define GET_RING_HEAD(dev_priv) DRM_READ32( (dev_priv)->ring_rptr, 0 ) @@ -466,6 +469,7 @@ extern void radeon_driver_free_filp_priv(drm_device_t * dev, # define RADEON_STENCIL_ENABLE (1 << 7) # define RADEON_Z_ENABLE (1 << 8) #define RADEON_RB3D_DEPTHOFFSET 0x1c24 +#define RADEON_RB3D_DEPTHCLEARVALUE 0x3230 #define RADEON_RB3D_DEPTHPITCH 0x1c28 #define RADEON_RB3D_PLANEMASK 0x1d84 #define RADEON_RB3D_STENCILREFMASK 0x1d7c @@ -478,11 +482,15 @@ extern void radeon_driver_free_filp_priv(drm_device_t * dev, #define RADEON_RB3D_ZSTENCILCNTL 0x1c2c # define RADEON_Z_TEST_MASK (7 << 4) # define RADEON_Z_TEST_ALWAYS (7 << 4) +# define RADEON_Z_HIERARCHY_ENABLE (1 << 8) # define RADEON_STENCIL_TEST_ALWAYS (7 << 12) # define RADEON_STENCIL_S_FAIL_REPLACE (2 << 16) # define RADEON_STENCIL_ZPASS_REPLACE (2 << 20) # define RADEON_STENCIL_ZFAIL_REPLACE (2 << 24) +# define RADEON_Z_COMPRESSION_ENABLE (1 << 28) +# define RADEON_FORCE_Z_DIRTY (1 << 29) # define RADEON_Z_WRITE_ENABLE (1 << 30) +# define RADEON_Z_DECOMPRESSION_ENABLE (1 << 31) #define RADEON_RBBM_SOFT_RESET 0x00f0 # define RADEON_SOFT_RESET_CP (1 << 0) # define RADEON_SOFT_RESET_HI (1 << 1) @@ -590,7 +598,7 @@ extern void radeon_driver_free_filp_priv(drm_device_t * dev, # define RADEON_WAIT_3D_IDLECLEAN (1 << 17) # define RADEON_WAIT_HOST_IDLECLEAN (1 << 18) -#define RADEON_RB3D_ZMASKOFFSET 0x1c34 +#define RADEON_RB3D_ZMASKOFFSET 0x3234 #define RADEON_RB3D_ZSTENCILCNTL 0x1c2c # define RADEON_DEPTH_FORMAT_16BIT_INT_Z (0 << 0) # define RADEON_DEPTH_FORMAT_24BIT_INT_Z (2 << 0) @@ -644,6 +652,8 @@ extern void radeon_driver_free_filp_priv(drm_device_t * dev, # define RADEON_3D_DRAW_IMMD 0x00002900 # define RADEON_3D_DRAW_INDX 0x00002A00 # define RADEON_3D_LOAD_VBPNTR 0x00002F00 +# define RADEON_3D_CLEAR_ZMASK 0x00003200 +# define RADEON_3D_CLEAR_HIZ 0x00003700 # define RADEON_CNTL_HOSTDATA_BLT 0x00009400 # define RADEON_CNTL_PAINT_MULTI 0x00009A00 # define RADEON_CNTL_BITBLT_MULTI 0x00009B00 @@ -801,6 +811,8 @@ extern void radeon_driver_free_filp_priv(drm_device_t * dev, #define R200_RB3D_BLENDCOLOR 0x3218 +#define R200_SE_TCL_POINT_SPRITE_CNTL 0x22c4 + /* Constants */ #define RADEON_MAX_USEC_TIMEOUT 100000 /* 100 ms */ diff --git a/shared-core/radeon_state.c b/shared-core/radeon_state.c index ec85efa..5ec9b35 100644 --- a/shared-core/radeon_state.c +++ b/shared-core/radeon_state.c @@ -271,6 +271,7 @@ static __inline__ int radeon_check_and_fixup_packets(drm_radeon_private_t * case RADEON_EMIT_PP_TEX_SIZE_1: case RADEON_EMIT_PP_TEX_SIZE_2: case R200_EMIT_RB3D_BLENDCOLOR: + case R200_EMIT_TCL_POINT_SPRITE_CNTL: /* These packets don't contain memory offsets */ break; @@ -646,7 +647,9 @@ static struct { RADEON_PP_TEX_SIZE_0, 2, "RADEON_PP_TEX_SIZE_0"}, { RADEON_PP_TEX_SIZE_1, 2, "RADEON_PP_TEX_SIZE_1"}, { RADEON_PP_TEX_SIZE_2, 2, "RADEON_PP_TEX_SIZE_2"}, { -R200_RB3D_BLENDCOLOR, 3, "R200_RB3D_BLENDCOLOR"},}; + R200_RB3D_BLENDCOLOR, 3, "R200_RB3D_BLENDCOLOR"}, { + R200_SE_TCL_POINT_SPRITE_CNTL, 1, "R200_SE_TCL_POINT_SPRITE_CNTL"}, +}; /* ================================================================ * Performance monitoring functions @@ -858,11 +861,159 @@ static void radeon_cp_dispatch_clear(drm_device_t * dev, } } + /* hyper z clear */ + /* no docs available, based on reverse engeneering by Stephane Marchesin */ + if ((flags & (RADEON_DEPTH | RADEON_STENCIL)) && (flags & RADEON_CLEAR_FASTZ)) { + + int i; + int depthpixperline = dev_priv->depth_fmt==RADEON_DEPTH_FORMAT_16BIT_INT_Z? + (dev_priv->depth_pitch / 2): (dev_priv->depth_pitch / 4); + + u32 clearmask; + + u32 tempRB3D_DEPTHCLEARVALUE = clear->clear_depth | + ((clear->depth_mask & 0xff) << 24); + + + /* Make sure we restore the 3D state next time. + * we haven't touched any "normal" state - still need this? + */ + dev_priv->sarea_priv->ctx_owner = 0; + + if ((dev_priv->flags & CHIP_HAS_HIERZ) && (flags & RADEON_USE_HIERZ)) { + /* FIXME : reverse engineer that for Rx00 cards */ + /* FIXME : the mask supposedly contains low-res z values. So can't set + just to the max (0xff? or actually 0x3fff?), need to take z clear + value into account? */ + /* pattern seems to work for r100, though get slight + rendering errors with glxgears. If hierz is not enabled for r100, + only 4 bits which indicate clear (15,16,31,32, all zero) matter, the + other ones are ignored, and the same clear mask can be used. That's + very different behaviour than R200 which needs different clear mask + and different number of tiles to clear if hierz is enabled or not !?! + */ + clearmask = (0xff<<22)|(0xff<<6)| 0x003f003f; + } + else { + /* clear mask : chooses the clearing pattern. + rv250: could be used to clear only parts of macrotiles + (but that would get really complicated...)? + bit 0 and 1 (either or both of them ?!?!) are used to + not clear tile (or maybe one of the bits indicates if the tile is + compressed or not), bit 2 and 3 to not clear tile 1,...,. + Pattern is as follows: + | 0,1 | 4,5 | 8,9 |12,13|16,17|20,21|24,25|28,29| + bits ------------------------------------------------- + | 2,3 | 6,7 |10,11|14,15|18,19|22,23|26,27|30,31| + rv100: clearmask covers 2x8 4x1 tiles, but one clear still + covers 256 pixels ?!? + */ + clearmask = 0x0; + } + + BEGIN_RING( 8 ); + RADEON_WAIT_UNTIL_2D_IDLE(); + OUT_RING_REG( RADEON_RB3D_DEPTHCLEARVALUE, + tempRB3D_DEPTHCLEARVALUE); + /* what offset is this exactly ? */ + OUT_RING_REG( RADEON_RB3D_ZMASKOFFSET, 0 ); + /* need ctlstat, otherwise get some strange black flickering */ + OUT_RING_REG( RADEON_RB3D_ZCACHE_CTLSTAT, RADEON_RB3D_ZC_FLUSH_ALL ); + ADVANCE_RING(); + + for (i = 0; i < nbox; i++) { + int tileoffset, nrtilesx, nrtilesy, j; + /* it looks like r200 needs rv-style clears, at least if hierz is not enabled? */ + if ((dev_priv->flags&CHIP_HAS_HIERZ) && !(dev_priv->microcode_version==UCODE_R200)) { + /* FIXME : figure this out for r200 (when hierz is enabled). Or + maybe r200 actually doesn't need to put the low-res z value into + the tile cache like r100, but just needs to clear the hi-level z-buffer? + Works for R100, both with hierz and without. + R100 seems to operate on 2x1 8x8 tiles, but... + odd: offset/nrtiles need to be 64 pix (4 block) aligned? Potentially + problematic with resolutions which are not 64 pix aligned? */ + tileoffset = ((pbox[i].y1 >> 3) * depthpixperline + pbox[i].x1) >> 6; + nrtilesx = ((pbox[i].x2 & ~63) - (pbox[i].x1 & ~63)) >> 4; + nrtilesy = (pbox[i].y2 >> 3) - (pbox[i].y1 >> 3); + for (j = 0; j <= nrtilesy; j++) { + BEGIN_RING( 4 ); + OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_ZMASK, 2 ) ); + /* first tile */ + OUT_RING( tileoffset * 8 ); + /* the number of tiles to clear */ + OUT_RING( nrtilesx + 4 ); + /* clear mask : chooses the clearing pattern. */ + OUT_RING( clearmask ); + ADVANCE_RING(); + tileoffset += depthpixperline >> 6; + } + } + else if (dev_priv->microcode_version==UCODE_R200) { + /* works for rv250. */ + /* find first macro tile (8x2 4x4 z-pixels on rv250) */ + tileoffset = ((pbox[i].y1 >> 3) * depthpixperline + pbox[i].x1) >> 5; + nrtilesx = (pbox[i].x2 >> 5) - (pbox[i].x1 >> 5); + nrtilesy = (pbox[i].y2 >> 3) - (pbox[i].y1 >> 3); + for (j = 0; j <= nrtilesy; j++) { + BEGIN_RING( 4 ); + OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_ZMASK, 2 ) ); + /* first tile */ + /* judging by the first tile offset needed, could possibly + directly address/clear 4x4 tiles instead of 8x2 * 4x4 + macro tiles, though would still need clear mask for + right/bottom if truely 4x4 granularity is desired ? */ + OUT_RING( tileoffset * 16 ); + /* the number of tiles to clear */ + OUT_RING( nrtilesx + 1 ); + /* clear mask : chooses the clearing pattern. */ + OUT_RING( clearmask ); + ADVANCE_RING(); + tileoffset += depthpixperline >> 5; + } + } + else { /* rv 100 */ + /* rv100 might not need 64 pix alignment, who knows */ + /* offsets are, hmm, weird */ + tileoffset = ((pbox[i].y1 >> 4) * depthpixperline + pbox[i].x1) >> 6; + nrtilesx = ((pbox[i].x2 & ~63) - (pbox[i].x1 & ~63)) >> 4; + nrtilesy = (pbox[i].y2 >> 4) - (pbox[i].y1 >> 4); + for (j = 0; j <= nrtilesy; j++) { + BEGIN_RING( 4 ); + OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_ZMASK, 2 ) ); + OUT_RING( tileoffset * 128 ); + /* the number of tiles to clear */ + OUT_RING( nrtilesx + 4 ); + /* clear mask : chooses the clearing pattern. */ + OUT_RING( clearmask ); + ADVANCE_RING(); + tileoffset += depthpixperline >> 6; + } + } + } + + /* TODO don't always clear all hi-level z tiles */ + if ((dev_priv->flags & CHIP_HAS_HIERZ) && (dev_priv->microcode_version==UCODE_R200) + && (flags & RADEON_USE_HIERZ)) + /* r100 and cards without hierarchical z-buffer have no high-level z-buffer */ + /* FIXME : the mask supposedly contains low-res z values. So can't set + just to the max (0xff? or actually 0x3fff?), need to take z clear + value into account? */ + { + BEGIN_RING( 4 ); + OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_HIZ, 2 ) ); + OUT_RING( 0x0 ); /* First tile */ + OUT_RING( 0x3cc0 ); + OUT_RING( (0xff<<22)|(0xff<<6)| 0x003f003f); + ADVANCE_RING(); + } + } + /* We have to clear the depth and/or stencil buffers by * rendering a quad into just those buffers. Thus, we have to * make sure the 3D engine is configured correctly. */ - if ((dev_priv->microcode_version == UCODE_R200) && (flags & (RADEON_DEPTH | RADEON_STENCIL))) { + else if ((dev_priv->microcode_version == UCODE_R200) && + (flags & (RADEON_DEPTH | RADEON_STENCIL))) { int tempPP_CNTL; int tempRE_CNTL; @@ -929,6 +1080,14 @@ static void radeon_cp_dispatch_clear(drm_device_t * dev, tempRB3D_STENCILREFMASK = 0x00000000; } + if (flags & RADEON_USE_COMP_ZBUF) { + tempRB3D_ZSTENCILCNTL |= RADEON_Z_COMPRESSION_ENABLE | + RADEON_Z_DECOMPRESSION_ENABLE; + } + if (flags & RADEON_USE_HIERZ) { + tempRB3D_ZSTENCILCNTL |= RADEON_Z_HIERARCHY_ENABLE; + } + BEGIN_RING(26); RADEON_WAIT_UNTIL_2D_IDLE(); @@ -979,6 +1138,8 @@ static void radeon_cp_dispatch_clear(drm_device_t * dev, } } else if ((flags & (RADEON_DEPTH | RADEON_STENCIL))) { + int tempRB3D_ZSTENCILCNTL = depth_clear->rb3d_zstencilcntl; + rb3d_cntl = depth_clear->rb3d_cntl; if (flags & RADEON_DEPTH) { @@ -995,6 +1156,14 @@ static void radeon_cp_dispatch_clear(drm_device_t * dev, rb3d_stencilrefmask = 0x00000000; } + if (flags & RADEON_USE_COMP_ZBUF) { + tempRB3D_ZSTENCILCNTL |= RADEON_Z_COMPRESSION_ENABLE | + RADEON_Z_DECOMPRESSION_ENABLE; + } + if (flags & RADEON_USE_HIERZ) { + tempRB3D_ZSTENCILCNTL |= RADEON_Z_HIERARCHY_ENABLE; + } + BEGIN_RING(13); RADEON_WAIT_UNTIL_2D_IDLE(); @@ -1002,8 +1171,7 @@ static void radeon_cp_dispatch_clear(drm_device_t * dev, OUT_RING(0x00000000); OUT_RING(rb3d_cntl); - OUT_RING_REG(RADEON_RB3D_ZSTENCILCNTL, - depth_clear->rb3d_zstencilcntl); + OUT_RING_REG(RADEON_RB3D_ZSTENCILCNTL, tempRB3D_ZSTENCILCNTL); OUT_RING_REG(RADEON_RB3D_STENCILREFMASK, rb3d_stencilrefmask); OUT_RING_REG(RADEON_RB3D_PLANEMASK, 0x00000000); OUT_RING_REG(RADEON_SE_CNTL, depth_clear->se_cntl); diff --git a/shared/radeon.h b/shared/radeon.h index 06f5017..80bfa0c 100644 --- a/shared/radeon.h +++ b/shared/radeon.h @@ -42,10 +42,10 @@ #define DRIVER_NAME "radeon" #define DRIVER_DESC "ATI Radeon" -#define DRIVER_DATE "20020828" +#define DRIVER_DATE "20041207" #define DRIVER_MAJOR 1 -#define DRIVER_MINOR 12 +#define DRIVER_MINOR 13 #define DRIVER_PATCHLEVEL 0 /* Interface history: @@ -82,6 +82,8 @@ * and GL_EXT_blend_[func|equation]_separate on r200 * 1.12- Add R300 CP microcode support - this just loads the CP on r300 * (No 3D support yet - just microcode loading). + * 1.13- Add packet R200_EMIT_TCL_POINT_SPRITE_CNTL for ARB_point_parameters + * - Add hyperz support, add hyperz flags to clear ioctl. */ #define DRIVER_IOCTLS \ [DRM_IOCTL_NR(DRM_IOCTL_DMA)] = { radeon_cp_buffers, 1, 0 }, \ diff --git a/shared/radeon_cp.c b/shared/radeon_cp.c index 75a7bd5..5d13f47 100644 --- a/shared/radeon_cp.c +++ b/shared/radeon_cp.c @@ -2017,6 +2017,18 @@ int radeon_preinit( struct drm_device *dev, unsigned long flags ) dev->dev_private = (void *)dev_priv; dev_priv->flags = flags; + switch (flags & CHIP_FAMILY_MASK) { + case CHIP_R100: + case CHIP_RV200: + case CHIP_R200: + case CHIP_R300: + dev_priv->flags |= CHIP_HAS_HIERZ; + break; + default: + /* all other chips have no hierarchical z buffer */ + break; + } + /* registers */ if( (ret = DRM(initmap)( dev, pci_resource_start( dev->pdev, 2 ), pci_resource_len( dev->pdev, 2 ), _DRM_REGISTERS, 0 ))) diff --git a/shared/radeon_drm.h b/shared/radeon_drm.h index 14d65ea..e086938 100644 --- a/shared/radeon_drm.h +++ b/shared/radeon_drm.h @@ -145,7 +145,8 @@ #define RADEON_EMIT_PP_TEX_SIZE_1 74 #define RADEON_EMIT_PP_TEX_SIZE_2 75 #define R200_EMIT_RB3D_BLENDCOLOR 76 -#define RADEON_MAX_STATE_PACKETS 77 +#define R200_EMIT_TCL_POINT_SPRITE_CNTL 77 +#define RADEON_MAX_STATE_PACKETS 78 /* Commands understood by cmd_buffer ioctl. More can be added but @@ -193,6 +194,9 @@ typedef union { #define RADEON_BACK 0x2 #define RADEON_DEPTH 0x4 #define RADEON_STENCIL 0x8 +#define RADEON_CLEAR_FASTZ 0x80000000 +#define RADEON_USE_HIERZ 0x40000000 +#define RADEON_USE_COMP_ZBUF 0x20000000 /* Primitive types */ diff --git a/shared/radeon_drv.h b/shared/radeon_drv.h index 32a6c3f..9e0e8fe 100644 --- a/shared/radeon_drv.h +++ b/shared/radeon_drv.h @@ -68,6 +68,7 @@ enum radeon_chip_flags { CHIP_IS_IGP = 0x00020000UL, CHIP_SINGLE_CRTC = 0x00040000UL, CHIP_IS_AGP = 0x00080000UL, + CHIP_HAS_HIERZ = 0x00100000UL, }; #define GET_RING_HEAD(dev_priv) DRM_READ32( (dev_priv)->ring_rptr, 0 ) @@ -411,6 +412,7 @@ extern void radeon_driver_irq_uninstall( drm_device_t *dev ); # define RADEON_STENCIL_ENABLE (1 << 7) # define RADEON_Z_ENABLE (1 << 8) #define RADEON_RB3D_DEPTHOFFSET 0x1c24 +#define RADEON_RB3D_DEPTHCLEARVALUE 0x3230 #define RADEON_RB3D_DEPTHPITCH 0x1c28 #define RADEON_RB3D_PLANEMASK 0x1d84 #define RADEON_RB3D_STENCILREFMASK 0x1d7c @@ -423,11 +425,15 @@ extern void radeon_driver_irq_uninstall( drm_device_t *dev ); #define RADEON_RB3D_ZSTENCILCNTL 0x1c2c # define RADEON_Z_TEST_MASK (7 << 4) # define RADEON_Z_TEST_ALWAYS (7 << 4) +# define RADEON_Z_HIERARCHY_ENABLE (1 << 8) # define RADEON_STENCIL_TEST_ALWAYS (7 << 12) # define RADEON_STENCIL_S_FAIL_REPLACE (2 << 16) # define RADEON_STENCIL_ZPASS_REPLACE (2 << 20) # define RADEON_STENCIL_ZFAIL_REPLACE (2 << 24) +# define RADEON_Z_COMPRESSION_ENABLE (1 << 28) +# define RADEON_FORCE_Z_DIRTY (1 << 29) # define RADEON_Z_WRITE_ENABLE (1 << 30) +# define RADEON_Z_DECOMPRESSION_ENABLE (1 << 31) #define RADEON_RBBM_SOFT_RESET 0x00f0 # define RADEON_SOFT_RESET_CP (1 << 0) # define RADEON_SOFT_RESET_HI (1 << 1) @@ -535,7 +541,7 @@ extern void radeon_driver_irq_uninstall( drm_device_t *dev ); # define RADEON_WAIT_3D_IDLECLEAN (1 << 17) # define RADEON_WAIT_HOST_IDLECLEAN (1 << 18) -#define RADEON_RB3D_ZMASKOFFSET 0x1c34 +#define RADEON_RB3D_ZMASKOFFSET 0x3234 #define RADEON_RB3D_ZSTENCILCNTL 0x1c2c # define RADEON_DEPTH_FORMAT_16BIT_INT_Z (0 << 0) # define RADEON_DEPTH_FORMAT_24BIT_INT_Z (2 << 0) @@ -590,6 +596,8 @@ extern void radeon_driver_irq_uninstall( drm_device_t *dev ); # define RADEON_3D_DRAW_IMMD 0x00002900 # define RADEON_3D_DRAW_INDX 0x00002A00 # define RADEON_3D_LOAD_VBPNTR 0x00002F00 +# define RADEON_3D_CLEAR_ZMASK 0x00003200 +# define RADEON_3D_CLEAR_HIZ 0x00003700 # define RADEON_CNTL_HOSTDATA_BLT 0x00009400 # define RADEON_CNTL_PAINT_MULTI 0x00009A00 # define RADEON_CNTL_BITBLT_MULTI 0x00009B00 @@ -748,6 +756,8 @@ extern void radeon_driver_irq_uninstall( drm_device_t *dev ); #define R200_RB3D_BLENDCOLOR 0x3218 +#define R200_SE_TCL_POINT_SPRITE_CNTL 0x22c4 + /* Constants */ #define RADEON_MAX_USEC_TIMEOUT 100000 /* 100 ms */ diff --git a/shared/radeon_state.c b/shared/radeon_state.c index 3cafd9a..caba6a3 100644 --- a/shared/radeon_state.c +++ b/shared/radeon_state.c @@ -205,6 +205,7 @@ static __inline__ int radeon_check_and_fixup_packets( drm_radeon_private_t *dev_ case RADEON_EMIT_PP_TEX_SIZE_1: case RADEON_EMIT_PP_TEX_SIZE_2: case R200_EMIT_RB3D_BLENDCOLOR: + case R200_EMIT_TCL_POINT_SPRITE_CNTL: /* These packets don't contain memory offsets */ break; @@ -569,6 +570,7 @@ static struct { { RADEON_PP_TEX_SIZE_1, 2, "RADEON_PP_TEX_SIZE_1" }, { RADEON_PP_TEX_SIZE_2, 2, "RADEON_PP_TEX_SIZE_2" }, { R200_RB3D_BLENDCOLOR, 3, "R200_RB3D_BLENDCOLOR" }, + { R200_SE_TCL_POINT_SPRITE_CNTL, 1, "R200_SE_TCL_POINT_SPRITE_CNTL"}, }; @@ -780,12 +782,159 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev, } } + /* hyper z clear */ + /* no docs available, based on reverse engeneering by Stephane Marchesin */ + if ((flags & (RADEON_DEPTH | RADEON_STENCIL)) && (flags & RADEON_CLEAR_FASTZ)) { + + int i; + int depthpixperline = dev_priv->depth_fmt==RADEON_DEPTH_FORMAT_16BIT_INT_Z? + (dev_priv->depth_pitch / 2): (dev_priv->depth_pitch / 4); + + u32 clearmask; + + u32 tempRB3D_DEPTHCLEARVALUE = clear->clear_depth | + ((clear->depth_mask & 0xff) << 24); + + + /* Make sure we restore the 3D state next time. + * we haven't touched any "normal" state - still need this? + */ + dev_priv->sarea_priv->ctx_owner = 0; + + if ((dev_priv->flags & CHIP_HAS_HIERZ) && (flags & RADEON_USE_HIERZ)) { + /* FIXME : reverse engineer that for Rx00 cards */ + /* FIXME : the mask supposedly contains low-res z values. So can't set + just to the max (0xff? or actually 0x3fff?), need to take z clear + value into account? */ + /* pattern seems to work for r100, though get slight + rendering errors with glxgears. If hierz is not enabled for r100, + only 4 bits which indicate clear (15,16,31,32, all zero) matter, the + other ones are ignored, and the same clear mask can be used. That's + very different behaviour than R200 which needs different clear mask + and different number of tiles to clear if hierz is enabled or not !?! + */ + clearmask = (0xff<<22)|(0xff<<6)| 0x003f003f; + } + else { + /* clear mask : chooses the clearing pattern. + rv250: could be used to clear only parts of macrotiles + (but that would get really complicated...)? + bit 0 and 1 (either or both of them ?!?!) are used to + not clear tile (or maybe one of the bits indicates if the tile is + compressed or not), bit 2 and 3 to not clear tile 1,...,. + Pattern is as follows: + | 0,1 | 4,5 | 8,9 |12,13|16,17|20,21|24,25|28,29| + bits ------------------------------------------------- + | 2,3 | 6,7 |10,11|14,15|18,19|22,23|26,27|30,31| + rv100: clearmask covers 2x8 4x1 tiles, but one clear still + covers 256 pixels ?!? + */ + clearmask = 0x0; + } + + BEGIN_RING( 8 ); + RADEON_WAIT_UNTIL_2D_IDLE(); + OUT_RING_REG( RADEON_RB3D_DEPTHCLEARVALUE, + tempRB3D_DEPTHCLEARVALUE); + /* what offset is this exactly ? */ + OUT_RING_REG( RADEON_RB3D_ZMASKOFFSET, 0 ); + /* need ctlstat, otherwise get some strange black flickering */ + OUT_RING_REG( RADEON_RB3D_ZCACHE_CTLSTAT, RADEON_RB3D_ZC_FLUSH_ALL ); + ADVANCE_RING(); + + for (i = 0; i < nbox; i++) { + int tileoffset, nrtilesx, nrtilesy, j; + /* it looks like r200 needs rv-style clears, at least if hierz is not enabled? */ + if ((dev_priv->flags&CHIP_HAS_HIERZ) && !(dev_priv->microcode_version==UCODE_R200)) { + /* FIXME : figure this out for r200 (when hierz is enabled). Or + maybe r200 actually doesn't need to put the low-res z value into + the tile cache like r100, but just needs to clear the hi-level z-buffer? + Works for R100, both with hierz and without. + R100 seems to operate on 2x1 8x8 tiles, but... + odd: offset/nrtiles need to be 64 pix (4 block) aligned? Potentially + problematic with resolutions which are not 64 pix aligned? */ + tileoffset = ((pbox[i].y1 >> 3) * depthpixperline + pbox[i].x1) >> 6; + nrtilesx = ((pbox[i].x2 & ~63) - (pbox[i].x1 & ~63)) >> 4; + nrtilesy = (pbox[i].y2 >> 3) - (pbox[i].y1 >> 3); + for (j = 0; j <= nrtilesy; j++) { + BEGIN_RING( 4 ); + OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_ZMASK, 2 ) ); + /* first tile */ + OUT_RING( tileoffset * 8 ); + /* the number of tiles to clear */ + OUT_RING( nrtilesx + 4 ); + /* clear mask : chooses the clearing pattern. */ + OUT_RING( clearmask ); + ADVANCE_RING(); + tileoffset += depthpixperline >> 6; + } + } + else if (dev_priv->microcode_version==UCODE_R200) { + /* works for rv250. */ + /* find first macro tile (8x2 4x4 z-pixels on rv250) */ + tileoffset = ((pbox[i].y1 >> 3) * depthpixperline + pbox[i].x1) >> 5; + nrtilesx = (pbox[i].x2 >> 5) - (pbox[i].x1 >> 5); + nrtilesy = (pbox[i].y2 >> 3) - (pbox[i].y1 >> 3); + for (j = 0; j <= nrtilesy; j++) { + BEGIN_RING( 4 ); + OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_ZMASK, 2 ) ); + /* first tile */ + /* judging by the first tile offset needed, could possibly + directly address/clear 4x4 tiles instead of 8x2 * 4x4 + macro tiles, though would still need clear mask for + right/bottom if truely 4x4 granularity is desired ? */ + OUT_RING( tileoffset * 16 ); + /* the number of tiles to clear */ + OUT_RING( nrtilesx + 1 ); + /* clear mask : chooses the clearing pattern. */ + OUT_RING( clearmask ); + ADVANCE_RING(); + tileoffset += depthpixperline >> 5; + } + } + else { /* rv 100 */ + /* rv100 might not need 64 pix alignment, who knows */ + /* offsets are, hmm, weird */ + tileoffset = ((pbox[i].y1 >> 4) * depthpixperline + pbox[i].x1) >> 6; + nrtilesx = ((pbox[i].x2 & ~63) - (pbox[i].x1 & ~63)) >> 4; + nrtilesy = (pbox[i].y2 >> 4) - (pbox[i].y1 >> 4); + for (j = 0; j <= nrtilesy; j++) { + BEGIN_RING( 4 ); + OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_ZMASK, 2 ) ); + OUT_RING( tileoffset * 128 ); + /* the number of tiles to clear */ + OUT_RING( nrtilesx + 4 ); + /* clear mask : chooses the clearing pattern. */ + OUT_RING( clearmask ); + ADVANCE_RING(); + tileoffset += depthpixperline >> 6; + } + } + } + + /* TODO don't always clear all hi-level z tiles */ + if ((dev_priv->flags & CHIP_HAS_HIERZ) && (dev_priv->microcode_version==UCODE_R200) + && (flags & RADEON_USE_HIERZ)) + /* r100 and cards without hierarchical z-buffer have no high-level z-buffer */ + /* FIXME : the mask supposedly contains low-res z values. So can't set + just to the max (0xff? or actually 0x3fff?), need to take z clear + value into account? */ + { + BEGIN_RING( 4 ); + OUT_RING( CP_PACKET3( RADEON_3D_CLEAR_HIZ, 2 ) ); + OUT_RING( 0x0 ); /* First tile */ + OUT_RING( 0x3cc0 ); + OUT_RING( (0xff<<22)|(0xff<<6)| 0x003f003f); + ADVANCE_RING(); + } + } + /* We have to clear the depth and/or stencil buffers by * rendering a quad into just those buffers. Thus, we have to * make sure the 3D engine is configured correctly. */ - if ( (dev_priv->microcode_version==UCODE_R200) && - (flags & (RADEON_DEPTH | RADEON_STENCIL)) ) { + else if ((dev_priv->microcode_version == UCODE_R200) && + (flags & (RADEON_DEPTH | RADEON_STENCIL))) { int tempPP_CNTL; int tempRE_CNTL; @@ -855,6 +1004,14 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev, tempRB3D_STENCILREFMASK = 0x00000000; } + if (flags & RADEON_USE_COMP_ZBUF) { + tempRB3D_ZSTENCILCNTL |= RADEON_Z_COMPRESSION_ENABLE | + RADEON_Z_DECOMPRESSION_ENABLE; + } + if (flags & RADEON_USE_HIERZ) { + tempRB3D_ZSTENCILCNTL |= RADEON_Z_HIERARCHY_ENABLE; + } + BEGIN_RING( 26 ); RADEON_WAIT_UNTIL_2D_IDLE(); @@ -909,6 +1066,8 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev, } else if ( (flags & (RADEON_DEPTH | RADEON_STENCIL)) ) { + int tempRB3D_ZSTENCILCNTL = depth_clear->rb3d_zstencilcntl; + rb3d_cntl = depth_clear->rb3d_cntl; if ( flags & RADEON_DEPTH ) { @@ -925,6 +1084,14 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev, rb3d_stencilrefmask = 0x00000000; } + if (flags & RADEON_USE_COMP_ZBUF) { + tempRB3D_ZSTENCILCNTL |= RADEON_Z_COMPRESSION_ENABLE | + RADEON_Z_DECOMPRESSION_ENABLE; + } + if (flags & RADEON_USE_HIERZ) { + tempRB3D_ZSTENCILCNTL |= RADEON_Z_HIERARCHY_ENABLE; + } + BEGIN_RING( 13 ); RADEON_WAIT_UNTIL_2D_IDLE(); @@ -933,7 +1100,7 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev, OUT_RING( rb3d_cntl ); OUT_RING_REG( RADEON_RB3D_ZSTENCILCNTL, - depth_clear->rb3d_zstencilcntl ); + tempRB3D_ZSTENCILCNTL ); OUT_RING_REG( RADEON_RB3D_STENCILREFMASK, rb3d_stencilrefmask ); OUT_RING_REG( RADEON_RB3D_PLANEMASK,