From 59be082909de6021ec7d08476253bd4c9920e137 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 13:45:58 -0700
Subject: [PATCH] Cell: implement Z16 and Z32 testing with SIMD instructions.

---
 src/mesa/pipe/cell/spu/spu_tile.h  |   3 +-
 src/mesa/pipe/cell/spu/spu_tri.c   | 222 +++++--------------------------------
 src/mesa/pipe/cell/spu/spu_ztest.h | 135 ++++++++++++++++++++++
 3 files changed, 163 insertions(+), 197 deletions(-)
 create mode 100644 src/mesa/pipe/cell/spu/spu_ztest.h

diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index 1f123a2..4b1ef2a 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -42,7 +42,8 @@
 typedef union {
    ushort t16[TILE_SIZE][TILE_SIZE];
    uint   t32[TILE_SIZE][TILE_SIZE];
-   float4 f4[TILE_SIZE/2][TILE_SIZE/2];
+   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];  /* 8 ushorts each; Z16 test indexes [y/2][x/4], tile-relative */
+   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];  /* 4 uints each; Z32 test indexes [y/2][x/2], tile-relative */
 } tile_t;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index a32878d..a26a4f0 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -39,18 +39,11 @@
 #include "spu_tile.h"
 #include "spu_tri.h"
 
+#include "spu_ztest.h"
 
-/*
- * If SIMD_Z=1 the Z buffer is floating point and we use vector instructions
- * to do Z testing/updating.
- */
-#define SIMD_Z 0
 
-#if SIMD_Z
+/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
 typedef vector unsigned int mask_t;
-#else
-typedef uint mask_t;
-#endif
 
 
 /**
@@ -282,20 +275,11 @@ pack_colors(uint uicolors[4], const float4 fcolors[4])
 }
 
 
-
-static unsigned int
-do_depth_test(int x, int y, unsigned int mask)
+static INLINE mask_t
+do_depth_test(int x, int y, mask_t quadmask)
 {
-   static const float4 zscale16
-      = {.f={65535.0, 65535.0, 65535.0, 65535.0}};
-   static const float4 zscale32
-      = {.f={(float)0xffffffff,
-             (float)0xffffffff,
-             (float)0xffffffff,
-             (float)0xffffffff}};
-   int ix = x - setup.cliprect_minx;
-   int iy = y - setup.cliprect_miny;
    float4 zvals;
+   mask_t mask;
 
    zvals.v = eval_z((float) x, (float) y);
 
@@ -305,129 +289,20 @@ do_depth_test(int x, int y, unsigned int mask)
       cur_tile_status_z = TILE_STATUS_DIRTY;
    }
 
-#if 0
-   if (cur_tile_status_z == TILE_STATUS_CLEAR) {
-      /* now, _really_ clear the tile */
-      clear_z_tile(&ztile);
-   }
-   else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
-      /* make sure we've got the tile from main mem */
-      wait_on_mask(1 << TAG_READ_TILE_Z);
-   }
-   cur_tile_status_z = TILE_STATUS_DIRTY;
-#endif
-
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
-      zvals.v = spu_mul(zvals.v, zscale16.v);
-      if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) zvals.f[0];
-         if (z < ztile.t16[iy][ix])
-            ztile.t16[iy][ix] = z;
-         else
-            mask &= ~MASK_TOP_LEFT;
-      }
-
-      if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) zvals.f[1];
-         if (z < ztile.t16[iy][ix+1])
-            ztile.t16[iy][ix+1] = z;
-         else
-            mask &= ~MASK_TOP_RIGHT;
-      }
-
-      if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) zvals.f[2];
-         if (z < ztile.t16[iy+1][ix])
-            ztile.t16[iy+1][ix] = z;
-         else
-            mask &= ~MASK_BOTTOM_LEFT;
-      }
-
-      if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) zvals.f[3];
-         if (z < ztile.t16[iy+1][ix+1])
-            ztile.t16[iy+1][ix+1] = z;
-         else
-            mask &= ~MASK_BOTTOM_RIGHT;
-      }
+      int ix = (x - setup.cliprect_minx) / 4;
+      int iy = (y - setup.cliprect_miny) / 2;
+      mask = spu_z16_test_less(zvals.v, &ztile.us8[iy][ix], x>>1, quadmask);
    }
    else {
-      zvals.v = spu_mul(zvals.v, zscale32.v);
-      ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
-      if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) zvals.f[0];
-         if (z < ztile.t32[iy][ix])
-            ztile.t32[iy][ix] = z;
-         else
-            mask &= ~MASK_TOP_LEFT;
-      }
-
-      if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) zvals.f[1];
-         if (z < ztile.t32[iy][ix+1])
-            ztile.t32[iy][ix+1] = z;
-         else
-            mask &= ~MASK_TOP_RIGHT;
-      }
-
-      if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) zvals.f[2];
-         if (z < ztile.t32[iy+1][ix])
-            ztile.t32[iy+1][ix] = z;
-         else
-            mask &= ~MASK_BOTTOM_LEFT;
-      }
-
-      if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) zvals.f[3];
-         if (z < ztile.t32[iy+1][ix+1])
-            ztile.t32[iy+1][ix+1] = z;
-         else
-            mask &= ~MASK_BOTTOM_RIGHT;
-      }
+      int ix = (x - setup.cliprect_minx) / 2;
+      int iy = (y - setup.cliprect_miny) / 2;
+      mask = spu_z32_test_less(zvals.v, &ztile.ui4[iy][ix], quadmask);
    }
-
-   if (mask)
-      cur_tile_status_z = TILE_STATUS_DIRTY;
-
    return mask;
 }
 
 
-
-
-static vector unsigned int
-do_depth_test_simd(int x, int y, vector unsigned int quadmask)
-{
-   int ix = (x - setup.cliprect_minx) / 2;
-   int iy = (y - setup.cliprect_miny) / 2;
-   float4 zvals;
-
-   vector unsigned int zmask;
-
-   zvals.v = eval_z((float) x, (float) y);
-
-   if (cur_tile_status_z == TILE_STATUS_CLEAR) {
-      /* now, _really_ clear the tile */
-      clear_z_tile(&ztile);
-   }
-   else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
-      /* make sure we've got the tile from main mem */
-      wait_on_mask(1 << TAG_READ_TILE_Z);
-   }
-   cur_tile_status_z = TILE_STATUS_DIRTY;
-
-   /* XXX fetch Z value sooner to hide latency here */
-   zmask = spu_cmpgt(ztile.f4[ix][iy].v, zvals.v);
-   zmask = spu_and(zmask, quadmask);
-
-   ztile.f4[ix][iy].v = spu_sel(ztile.f4[ix][iy].v, zvals.v, zmask);
-   //ztile.f4[ix][iy].v = spu_sel(zvals.v, ztile.f4[ix][iy].v, mask4);
-
-   return zmask;
-}
-
-
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  */
@@ -461,36 +336,18 @@ emit_quad( int x, int y, mask_t mask )
    }
 
    if (spu.depth_stencil.depth.enabled) {
-#if SIMD_Z
-      mask = do_depth_test_simd(x, y, mask);
-#else
       mask = do_depth_test(x, y, mask);
-#endif
    }
 
-#if !SIMD_Z
-   if (mask)
-#endif
-   {
-      if (cur_tile_status_c == TILE_STATUS_CLEAR) {
-         /* now, _really_ clear the tile */
-         clear_c_tile(&ctile);
-      }
+   /* If any bits in mask are set... */
+   if (spu_extract(spu_orx(mask), 0)) {
 
-#if 0
       if (cur_tile_status_c == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
-         cur_tile_status_c = TILE_STATUS_DIRTY;
       }
-      else if (cur_tile_status_c != TILE_STATUS_DIRTY) {
-         /* make sure we've got the tile from main mem */
-         wait_on_mask(1 << TAG_READ_TILE_COLOR);
-      }
-#endif
       cur_tile_status_c = TILE_STATUS_DIRTY;
 
-#if SIMD_Z
       if (spu_extract(mask, 0))
          ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
       if (spu_extract(mask, 1))
@@ -499,20 +356,11 @@ emit_quad( int x, int y, mask_t mask )
          ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (spu_extract(mask, 3))
          ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
-#elif 0
+
+#if 0
       /* SIMD_Z with swizzled color buffer (someday) */
       vector float icolors = *((vector float *) &colors);
       ctile.f4[iy/2][ix/2].v = spu_sel(ctile.f4[iy/2][ix/2].v, icolors, mask);
-
-#else
-      if (mask & MASK_TOP_LEFT)
-         ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
-      if (mask & MASK_TOP_RIGHT)
-         ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT];
-      if (mask & MASK_BOTTOM_LEFT)
-         ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
-      if (mask & MASK_BOTTOM_RIGHT)
-         ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
 #endif
    }
 
@@ -533,38 +381,20 @@ static INLINE int block( int x )
 /**
  * Compute mask which indicates which pixels in the 2x2 quad are actually inside
  * the triangle's bounds.
- *
- * this is pretty nasty...  may need to rework flush_spans again to
- * fix it, if possible.
+ * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
  */
-static mask_t calculate_mask( int x )
+static INLINE mask_t calculate_mask( int x )
 {
-#if SIMD_Z
-   uint m0, m1, m2, m3;
-
-   m0 = (x >= setup.span.left[0] && x < setup.span.right[0]) * ~0;
-   m1 = (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) * ~0;
-   m2 = (x >= setup.span.left[1] && x < setup.span.right[1]) * ~0;
-   m3 = (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) * ~0;
-
-   return (vector unsigned int) {m0, m1, m2, m3};
-#else
-   unsigned mask = 0x0;
-
-   if (x >= setup.span.left[0] && x < setup.span.right[0]) 
-      mask |= MASK_TOP_LEFT;
-
-   if (x >= setup.span.left[1] && x < setup.span.right[1]) 
-      mask |= MASK_BOTTOM_LEFT;
-      
-   if (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) 
-      mask |= MASK_TOP_RIGHT;
-
-   if (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) 
-      mask |= MASK_BOTTOM_RIGHT;
-
+   /* Build each lane without branches: & (not &&) combines the two 0/1
+    * comparison results, and unary minus turns 1/0 into ~0/0.
+    * Lanes 0,1 test pixels x,x+1 against span 0; lanes 2,3 against span 1.
+    */
+   mask_t mask;  /* NOTE(review): read uninitialized by the first spu_insert; all 4 lanes are overwritten below */
+   mask = spu_insert(-((x   >= setup.span.left[0]) & (x   < setup.span.right[0])), mask, 0);
+   mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
+   mask = spu_insert(-((x   >= setup.span.left[1]) & (x   < setup.span.right[1])), mask, 2);
+   mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
    return mask;
-#endif
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_ztest.h b/src/mesa/pipe/cell/spu/spu_ztest.h
new file mode 100644
index 0000000..5fefb15
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_ztest.h
@@ -0,0 +1,135 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * Zbuffer/depth test code.
+ */
+
+
+#ifndef SPU_ZTEST_H
+#define SPU_ZTEST_H
+
+
+#ifdef __SPU__
+#include <spu_intrinsics.h>
+#endif
+
+
+
+/**
+ * Perform a "less than" Z test and Z update for a 16-bit/value Z buffer.
+ *
+ * \param zvals  vector of four fragment zvalues as floats
+ * \param zbuf   ptr to vector of ushort[8] zbuffer values.  Note that this
+ *               contains the Z values for 2 quads, 8 pixels.
+ * \param x      quad index; only the lsbit is used: 0 = ushorts 0..3, 1 = 4..7
+ * \param inMask indicates which fragments in the quad are alive
+ * \return new mask of fragments that passed; their Z values are stored to *zbuf
+ */
+static INLINE vector unsigned int
+spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
+                  uint x, vector unsigned int inMask)
+{
+#define ZERO 0x80  /* shuffle control byte that yields a 0x00 result byte */
+   vector unsigned int zvals_ui4, zbuf_ui4, mask;
+
+   /* convert floats to uints in [0, 65535] */
+   zvals_ui4 = spu_convtu(zvals, 32); /* convert to [0, 2^32] */
+   zvals_ui4 = spu_rlmask(zvals_ui4, -16);  /* right shift 16 */
+
+   /* XXX this conditional could be removed with a bit of work */
+   if (x & 1) {
+      /* convert zbuffer values from ushorts to uints */
+      /* gather ushorts 4..7 (bytes 8..15), zero-extending each to 32 bits */
+      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+                             (vector unsigned int) *zbuf,
+                             VEC_LITERAL(vector unsigned char,
+                                      ZERO, ZERO,  8,  9, ZERO, ZERO, 10, 11,
+                                      ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15));
+      /* mask = (zvals_ui4 < zbuf_ui4) ? ~0 : 0 */
+      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+      /* mask &= inMask */
+      mask = spu_and(mask, inMask);
+      /* zbuf = mask ? zval : zbuf */
+      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+      /* pack low 16 bits of each uint into ushorts 4..7; keep ushorts 0..3 */
+      *zbuf = (vector unsigned short)
+         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+                     VEC_LITERAL(vector unsigned char,
+                                 16, 17, 18, 19, 20, 21, 22, 23,
+                                 2, 3, 6, 7, 10, 11, 14, 15));
+   }
+   else {
+      /* convert zbuffer values from ushorts to uints */
+      /* gather ushorts 0..3 (bytes 0..7), zero-extending each to 32 bits */
+      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+                             (vector unsigned int) *zbuf,
+                             VEC_LITERAL(vector unsigned char,
+                                         ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
+                                         ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7));
+      /* mask = (zvals_ui4 < zbuf_ui4) ? ~0 : 0 */
+      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+      /* mask &= inMask */
+      mask = spu_and(mask, inMask);
+      /* zbuf = mask ? zval : zbuf */
+      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+      /* pack low 16 bits of each uint into ushorts 0..3; keep ushorts 4..7 */
+      *zbuf = (vector unsigned short)
+         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+                     VEC_LITERAL(vector unsigned char,
+                                 2, 3, 6, 7, 10, 11, 14, 15,
+                                 24, 25, 26, 27, 28, 29, 30, 31));
+   }
+   return mask;
+#undef ZERO
+}
+
+
+/**
+ * As above, but Zbuffer values are 32-bit uints: *zbuf_ptr holds one quad.
+ */
+static INLINE vector unsigned int
+spu_z32_test_less(vector float zvals, vector unsigned int *zbuf_ptr,
+                  vector unsigned int inMask)
+{
+   vector unsigned int zvals_ui4, mask, zbuf = *zbuf_ptr;
+
+   /* convert floats to uints in [0, 0xffffffff] */
+   zvals_ui4 = spu_convtu(zvals, 32);
+   /* mask = (zvals_ui4 < zbuf) ? ~0 : 0 */
+   mask = spu_cmpgt(zbuf, zvals_ui4);
+   /* mask &= inMask */
+   mask = spu_and(mask, inMask);
+   /* zbuf = mask ? zval : zbuf */
+   *zbuf_ptr = spu_sel(zbuf, zvals_ui4, mask);
+
+   return mask;
+}
+
+
+#endif /* SPU_ZTEST_H */
-- 
2.7.4