From 92126cea846959bb2152905a7712753d1114bd6b Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 26 Mar 2008 10:45:32 -0700
Subject: [PATCH] cell: Implement code-gen for logic op

This also implements code-gen for the float-to-packed color
conversion.  It's currently hardcoded for A8R8G8B8, but that can
easily be fixed as soon as other color depths are supported by the
Cell driver.
---
 src/gallium/drivers/cell/common.h                  |  11 +-
 src/gallium/drivers/cell/ppu/cell_context.h        |   2 +
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |  17 ++
 .../drivers/cell/ppu/cell_state_per_fragment.c     | 261 ++++++++++++++++++++-
 .../drivers/cell/ppu/cell_state_per_fragment.h     |   4 +
 src/gallium/drivers/cell/spu/spu_main.c            |  19 ++
 src/gallium/drivers/cell/spu/spu_main.h            |   9 +-
 src/gallium/drivers/cell/spu/spu_tri.c             |  59 +++--
 8 files changed, 349 insertions(+), 33 deletions(-)

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index d59e4f7..b0928fe 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -92,8 +92,9 @@
 #define CELL_CMD_STATE_BIND_VS       18
 #define CELL_CMD_STATE_BLEND         19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
-#define CELL_CMD_VS_EXECUTE          21
-#define CELL_CMD_FLUSH_BUFFER_RANGE  22
+#define CELL_CMD_STATE_LOGICOP       21
+#define CELL_CMD_VS_EXECUTE          22
+#define CELL_CMD_FLUSH_BUFFER_RANGE  23
 
 
 #define CELL_NUM_BUFFERS 4
@@ -124,6 +125,12 @@ struct cell_command_blend {
 };
 
 
+struct cell_command_logicop {
+   uint64_t base;               /**< Effective address of code start. */
+   unsigned size;               /**< Size in bytes of test code. */
+};
+
+
 /**
  * Tell SPUs about the framebuffer size, location
  */
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 9e79db0..0442abd 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -92,6 +92,8 @@ struct cell_context
    const struct cell_vertex_shader_state *vs;
    const struct cell_fragment_shader_state *fs;
 
+   struct spe_function logic_op;
+
    struct pipe_blend_color blend_color;
    struct pipe_clip_state clip;
    struct pipe_constant_buffer constants[2];
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 5709b48..5c1310d 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -50,6 +50,23 @@ emit_state_cmd(struct cell_context *cell, uint cmd,
 void
 cell_emit_state(struct cell_context *cell)
 {
+   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_BLEND)) {
+      struct cell_command_logicop logicop;
+
+      if (cell->logic_op.store != NULL) {
+	 spe_release_func(& cell->logic_op);
+      }
+
+      cell_generate_logic_op(& cell->logic_op,
+			     & cell->blend->base,
+			     cell->framebuffer.cbufs[0]);
+
+      logicop.base = (intptr_t) cell->logic_op.store;
+      logicop.size = 64 * 4;
+      emit_state_cmd(cell, CELL_CMD_STATE_LOGICOP, &logicop,
+		     sizeof(logicop));
+   }
+
    if (cell->dirty & CELL_NEW_FRAMEBUFFER) {
       struct pipe_surface *cbuf = cell->framebuffer.cbufs[0];
       struct pipe_surface *zbuf = cell->framebuffer.zsbuf;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
index c750b1d..f10025b 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -977,7 +977,6 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
       spe_allocate_register(f, 13),
       spe_allocate_register(f, 14),
    };
-   const int mask = spe_allocate_register(f, 15);
    unsigned func[4];
    unsigned sF[4];
    unsigned dF[4];
@@ -1114,9 +1113,6 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
                                 func[i], sF[i], dF[i],
                                 frag[i], src_factor[i],
                                 pixel[i], dst_factor[i]);
-         spe_selb(f, frag[i], pixel[i], frag[i], mask);
-      } else {
-         spe_or(f, frag[i], pixel[i], pixel[i]);
       }
    }
 
@@ -1146,3 +1142,260 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
    }
 #endif
 }
+
+
+int PC_OFFSET(const struct spe_function *f, const void *d)
+{
+   const intptr_t pc = (intptr_t) f->csr;
+   const intptr_t ea = ~0x0f & (intptr_t) d;
+
+   return (ea - pc) >> 2;
+}
+
+
+/**
+ * Generate code to perform color conversion and logic op
+ *
+ * \bug
+ * The code generated by this function should also perform dithering.
+ *
+ * \bug
+ * The code generated by this function should also perform color-write
+ * masking.
+ *
+ * \bug
+ * This routine is hard-coded to only work with ARGB8 data.
+ */
+void
+cell_generate_logic_op(struct spe_function *f, struct pipe_blend_state *blend,
+                       struct pipe_surface *surf)
+{
+   const unsigned logic_op = (blend->logicop_enable)
+       ? blend->logicop_func : PIPE_LOGICOP_COPY;
+
+   /* This code generates a maximum of 37 instructions.  An additional 32
+    * bytes (equiv. to 8 instructions) are needed for data storage.  Round up
+    * to 64 to make it a happy power-of-two.
+    */
+   spe_init_func(f, 4 * 64);
+
+
+   /* Pixel colors in framebuffer format in AoS layout.
+    */
+   const int pixel[4] = {
+      spe_allocate_register(f, 3),
+      spe_allocate_register(f, 4),
+      spe_allocate_register(f, 5),
+      spe_allocate_register(f, 6),
+   };
+
+   /* Fragment colors stored as floats in SoA layout.
+    */
+   const int frag[4] = {
+      spe_allocate_register(f, 7),
+      spe_allocate_register(f, 8),
+      spe_allocate_register(f, 9),
+      spe_allocate_register(f, 10),
+   };
+
+   const int mask = spe_allocate_register(f, 11);
+
+
+   /* Short-circuit the noop and invert cases.
+    */
+   if ((logic_op == PIPE_LOGICOP_NOOP) || (blend->colormask == 0)) {
+      spe_bi(f, 0, 0, 0);
+      return;
+   } else if (logic_op == PIPE_LOGICOP_INVERT) {
+      spe_nor(f, pixel[0], pixel[0], pixel[0]);
+      spe_nor(f, pixel[1], pixel[1], pixel[1]);
+      spe_nor(f, pixel[2], pixel[2], pixel[2]);
+      spe_nor(f, pixel[3], pixel[3], pixel[3]);
+      spe_bi(f, 0, 0, 0);
+      return;
+   }
+
+
+   const int tmp[4] = {
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+      spe_allocate_available_register(f),
+   };
+
+   const int shuf_xpose_hi = spe_allocate_available_register(f);
+   const int shuf_xpose_lo = spe_allocate_available_register(f);
+   const int shuf_color = spe_allocate_available_register(f);
+
+
+   /* Pointer to the begining of the function's private data area.
+    */
+   uint32_t *const data = ((uint32_t *) f->store) + (64 - 8);
+
+
+   /* Convert fragment colors to framebuffer format in AoS layout.
+    */
+   data[0] = 0x00010203;
+   data[1] = 0x10111213;
+   data[2] = 0x04050607;
+   data[3] = 0x14151617;
+
+   data[4] = 0x0c000408;
+   data[5] = 0x80808080;
+   data[6] = 0x80808080;
+   data[7] = 0x80808080;
+
+   spe_ilh(f, tmp[0], 0x0808);
+   spe_lqr(f, shuf_xpose_hi, PC_OFFSET(f, data+0));
+   spe_lqr(f, shuf_color, PC_OFFSET(f, data+4));
+   spe_a(f, shuf_xpose_lo, shuf_xpose_hi, tmp[0]);
+
+   spe_shufb(f, tmp[0], frag[0], frag[2], shuf_xpose_hi);
+   spe_shufb(f, tmp[1], frag[0], frag[2], shuf_xpose_lo);
+   spe_shufb(f, tmp[2], frag[1], frag[3], shuf_xpose_hi);
+   spe_shufb(f, tmp[3], frag[1], frag[3], shuf_xpose_lo);
+
+   spe_shufb(f, frag[0], tmp[0], tmp[2], shuf_xpose_hi);
+   spe_shufb(f, frag[1], tmp[0], tmp[2], shuf_xpose_lo);
+   spe_shufb(f, frag[2], tmp[1], tmp[3], shuf_xpose_hi);
+   spe_shufb(f, frag[3], tmp[1], tmp[3], shuf_xpose_lo);
+
+   spe_cfltu(f, frag[0], frag[0], 32);
+   spe_cfltu(f, frag[1], frag[1], 32);
+   spe_cfltu(f, frag[2], frag[2], 32);
+   spe_cfltu(f, frag[3], frag[3], 32);
+
+   spe_shufb(f, frag[0], frag[0], pixel[0], shuf_color);
+   spe_shufb(f, frag[1], frag[1], pixel[1], shuf_color);
+   spe_shufb(f, frag[2], frag[2], pixel[2], shuf_color);
+   spe_shufb(f, frag[3], frag[3], pixel[3], shuf_color);
+
+
+   /* If logic op is enabled, perform the requested logical operation on the
+    * converted fragment colors and the pixel colors.
+    */
+   switch (logic_op) {
+   case PIPE_LOGICOP_CLEAR:
+      spe_il(f, frag[0], 0);
+      spe_il(f, frag[1], 0);
+      spe_il(f, frag[2], 0);
+      spe_il(f, frag[3], 0);
+      break;
+   case PIPE_LOGICOP_NOR:
+      spe_nor(f, frag[0], frag[0], pixel[0]);
+      spe_nor(f, frag[1], frag[1], pixel[1]);
+      spe_nor(f, frag[2], frag[2], pixel[2]);
+      spe_nor(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_AND_INVERTED:
+      spe_andc(f, frag[0], pixel[0], frag[0]);
+      spe_andc(f, frag[1], pixel[1], frag[1]);
+      spe_andc(f, frag[2], pixel[2], frag[2]);
+      spe_andc(f, frag[3], pixel[3], frag[3]);
+      break;
+   case PIPE_LOGICOP_COPY_INVERTED:
+      spe_nor(f, frag[0], frag[0], frag[0]);
+      spe_nor(f, frag[1], frag[1], frag[1]);
+      spe_nor(f, frag[2], frag[2], frag[2]);
+      spe_nor(f, frag[3], frag[3], frag[3]);
+      break;
+   case PIPE_LOGICOP_AND_REVERSE:
+      spe_andc(f, frag[0], frag[0], pixel[0]);
+      spe_andc(f, frag[1], frag[1], pixel[1]);
+      spe_andc(f, frag[2], frag[2], pixel[2]);
+      spe_andc(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_XOR:
+      spe_xor(f, frag[0], frag[0], pixel[0]);
+      spe_xor(f, frag[1], frag[1], pixel[1]);
+      spe_xor(f, frag[2], frag[2], pixel[2]);
+      spe_xor(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_NAND:
+      spe_nand(f, frag[0], frag[0], pixel[0]);
+      spe_nand(f, frag[1], frag[1], pixel[1]);
+      spe_nand(f, frag[2], frag[2], pixel[2]);
+      spe_nand(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_AND:
+      spe_and(f, frag[0], frag[0], pixel[0]);
+      spe_and(f, frag[1], frag[1], pixel[1]);
+      spe_and(f, frag[2], frag[2], pixel[2]);
+      spe_and(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_EQUIV:
+      spe_eqv(f, frag[0], frag[0], pixel[0]);
+      spe_eqv(f, frag[1], frag[1], pixel[1]);
+      spe_eqv(f, frag[2], frag[2], pixel[2]);
+      spe_eqv(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_OR_INVERTED:
+      spe_orc(f, frag[0], pixel[0], frag[0]);
+      spe_orc(f, frag[1], pixel[1], frag[1]);
+      spe_orc(f, frag[2], pixel[2], frag[2]);
+      spe_orc(f, frag[3], pixel[3], frag[3]);
+      break;
+   case PIPE_LOGICOP_COPY:
+      break;
+   case PIPE_LOGICOP_OR_REVERSE:
+      spe_orc(f, frag[0], frag[0], pixel[0]);
+      spe_orc(f, frag[1], frag[1], pixel[1]);
+      spe_orc(f, frag[2], frag[2], pixel[2]);
+      spe_orc(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_OR:
+      spe_or(f, frag[0], frag[0], pixel[0]);
+      spe_or(f, frag[1], frag[1], pixel[1]);
+      spe_or(f, frag[2], frag[2], pixel[2]);
+      spe_or(f, frag[3], frag[3], pixel[3]);
+      break;
+   case PIPE_LOGICOP_SET:
+      spe_il(f, frag[0], ~0);
+      spe_il(f, frag[1], ~0);
+      spe_il(f, frag[2], ~0);
+      spe_il(f, frag[3], ~0);
+      break;
+
+   /* These two cases are short-circuited above.
+    */
+   case PIPE_LOGICOP_INVERT:
+   case PIPE_LOGICOP_NOOP:
+   default:
+      assert(0);
+   }
+
+
+   /* Apply fragment mask.
+    */
+   spe_ilh(f, tmp[0], 0x0000);
+   spe_ilh(f, tmp[1], 0x0404);
+   spe_ilh(f, tmp[2], 0x0808);
+   spe_ilh(f, tmp[3], 0x0c0c);
+
+   spe_shufb(f, tmp[0], mask, mask, tmp[0]);
+   spe_shufb(f, tmp[1], mask, mask, tmp[1]);
+   spe_shufb(f, tmp[2], mask, mask, tmp[2]);
+   spe_shufb(f, tmp[3], mask, mask, tmp[3]);
+
+   spe_selb(f, pixel[0], pixel[0], frag[0], tmp[0]);
+   spe_selb(f, pixel[1], pixel[1], frag[1], tmp[1]);
+   spe_selb(f, pixel[2], pixel[2], frag[2], tmp[2]);
+   spe_selb(f, pixel[3], pixel[3], frag[3], tmp[3]);
+
+   spe_bi(f, 0, 0, 0);
+
+#if 0
+   {
+      const uint32_t *p = f->store;
+      unsigned i;
+
+      printf("# %u instructions\n", f->csr - f->store);
+
+      printf("\t.text\n");
+      for (i = 0; i < 64; i++) {
+         printf("\t.long\t0x%04x\n", p[i]);
+      }
+      fflush(stdout);
+   }
+#endif
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
index f699247..ab4de96 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h
@@ -31,4 +31,8 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa);
 extern void
 cell_generate_alpha_blend(struct cell_blend_state *cb);
 
+extern void
+cell_generate_logic_op(struct spe_function *f, struct pipe_blend_state *blend,
+    struct pipe_surface *surf);
+
 #endif /* CELL_STATE_PER_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 0a490ab..fccff01 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -64,6 +64,9 @@ static unsigned char depth_stencil_code_buffer[4 * 64]
 static unsigned char fb_blend_code_buffer[4 * 64]
     ALIGN16_ATTRIB;
 
+static unsigned char logicop_code_buffer[4 * 64]
+    ALIGN16_ATTRIB;
+
 
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
@@ -513,6 +516,22 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
       }
+      case CELL_CMD_STATE_LOGICOP: {
+         struct cell_command_logicop *code =
+             (struct cell_command_logicop *) &buffer[pos+1];
+
+              mfc_get(logicop_code_buffer,
+                      (unsigned int) code->base,  /* src */
+                      code->size,
+                      TAG_BATCH_BUFFER,
+                      0, /* tid */
+                      0  /* rid */);
+         wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+	 spu.logicop = (logicop_func) logicop_code_buffer;
+         pos += (1 + ROUNDUP8(sizeof(struct cell_command_logicop)) / 8);
+         break;
+      }
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
 	 struct cell_buffer_range *br = (struct cell_buffer_range *)
 	     &buffer[pos+1];
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 49f5d99..c204529 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -77,9 +77,14 @@ struct spu_blend_results {
 typedef struct spu_blend_results (*blend_func)(
     qword frag_r, qword frag_g, qword frag_b, qword frag_a,
     qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
-    qword const_r, qword const_g, qword const_b, qword const_a,
+    qword const_r, qword const_g, qword const_b, qword const_a);
+
+typedef struct spu_blend_results (*logicop_func)(
+    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
+    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
     qword frag_mask);
 
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -111,6 +116,8 @@ struct spu_global
    blend_func blend;
    qword const_blend_color[4] ALIGN16_ATTRIB;
 
+   logicop_func logicop;
+
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct cell_command_texture texture;
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index e6a1ce0..95c629a 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -305,7 +305,6 @@ emit_quad( int x, int y, mask_t mask )
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
-      const vector unsigned char shuffle = spu.color_shuffle;
       vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
@@ -330,45 +329,53 @@ emit_quad( int x, int y, mask_t mask )
       }
 
 
+      /* Convert fragment data from AoS to SoA format.
+       */
+      qword soa_frag[4];
+      _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
+
       /* Read the current framebuffer values.
-       *
-       * Ignore read_fb for now.  In the future we can use this to avoid
-       * reading the framebuffer if read_fb is false and the fragment mask is
-       * all 0xffffffff.  This is the common case, so it is probably worth
-       * the effort.  We'll have to profile to determine whether or not the
-       * extra conditional branches hurt overall performance.
        */
-      vec_float4 aos_pix[4] = {
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]),
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]),
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]),
+      const qword pix[4] = {
+         (qword) spu_splats(spu.ctile.ui[iy+0][ix+0]),
+         (qword) spu_splats(spu.ctile.ui[iy+0][ix+1]),
+         (qword) spu_splats(spu.ctile.ui[iy+1][ix+0]),
+         (qword) spu_splats(spu.ctile.ui[iy+1][ix+1]),
       };
 
       qword soa_pix[4];
-      qword soa_frag[4];
 
-      /* Convert pixel and fragment data from AoS to SoA format.
-       */
-      _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix);
-      _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
+      if (spu.read_fb) {
+         /* Convert pixel data from AoS to SoA format.
+          */
+         vec_float4 aos_pix[4] = {
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]),
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]),
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]),
+         };
+
+         _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix);
+      }
 
-      const struct spu_blend_results result =
+
+      struct spu_blend_results result =
           (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3],
                        soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3],
                        spu.const_blend_color[0], spu.const_blend_color[1],
-                       spu.const_blend_color[2], spu.const_blend_color[3],
-                       (qword) mask);
+                       spu.const_blend_color[2], spu.const_blend_color[3]);
 
 
       /* Convert final pixel data from SoA to AoS format.
        */
-      _transpose_matrix4x4(aos_pix, (const vec_float4 *) &result);
-
-      spu.ctile.ui[iy+0][ix+0] = spu_pack_color_shuffle(aos_pix[0], shuffle);
-      spu.ctile.ui[iy+0][ix+1] = spu_pack_color_shuffle(aos_pix[1], shuffle);
-      spu.ctile.ui[iy+1][ix+0] = spu_pack_color_shuffle(aos_pix[2], shuffle);
-      spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(aos_pix[3], shuffle);
+      result = (*spu.logicop)(pix[0], pix[1], pix[2], pix[3],
+                              result.r, result.g, result.b, result.a,
+                              (qword) mask);
+
+      spu.ctile.ui[iy+0][ix+0] = spu_extract((vec_uint4) result.r, 0);
+      spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0);
+      spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0);
+      spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0);
    }
 #endif
 }
-- 
2.7.4