From: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Date: Tue, 23 May 2023 22:13:05 +0000 (-0400)
Subject: gallium: Return SSA values from TTN ALU helpers
X-Git-Tag: upstream/23.3.3~5765
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a6d9f168ce1b7c78f41798e1e72dc4db385ae84a;p=platform%2Fupstream%2Fmesa.git

gallium: Return SSA values from TTN ALU helpers

This is a lot simpler!

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23089>
---

diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 665a20b8ec1..a8c0ddc5a40 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -178,26 +178,6 @@ ttn_get_depth_layout(unsigned tgsi_fs_depth_layout)
    }
 }
 
-static nir_ssa_def *
-ttn_src_for_dest(nir_builder *b, nir_alu_dest *dest)
-{
-   nir_alu_src src;
-   memset(&src, 0, sizeof(src));
-
-   if (dest->dest.is_ssa)
-      src.src = nir_src_for_ssa(&dest->dest.ssa);
-   else {
-      assert(!dest->dest.reg.indirect);
-      src.src = nir_src_for_reg(dest->dest.reg.reg);
-      src.src.reg.base_offset = dest->dest.reg.base_offset;
-   }
-
-   for (int i = 0; i < 4; i++)
-      src.swizzle[i] = i;
-
-   return nir_mov_alu(b, src, 4);
-}
-
 static enum glsl_interp_mode
 ttn_translate_interp_mode(unsigned tgsi_interp)
 {
@@ -797,55 +777,6 @@ ttn_src_for_indirect(struct ttn_compile *c, struct tgsi_ind_register *indirect)
    return nir_mov_alu(b, src, 1);
 }
 
-static nir_alu_dest
-ttn_get_dest(struct ttn_compile *c, struct tgsi_full_dst_register *tgsi_fdst)
-{
-   struct tgsi_dst_register *tgsi_dst = &tgsi_fdst->Register;
-   nir_alu_dest dest;
-   unsigned index = tgsi_dst->Index;
-
-   memset(&dest, 0, sizeof(dest));
-
-   if (tgsi_dst->File == TGSI_FILE_TEMPORARY) {
-      if (c->temp_regs[index].var) {
-          nir_register *reg;
-
-         /* this works, because TGSI will give us a base offset
-          * (in case of indirect index) that points back into
-          * the array.  Access can be direct or indirect, we
-          * don't really care.  Just create a one-shot dst reg
-          * that will get store_var'd back into the array var
-          * at the end of ttn_emit_instruction()
-          */
-         reg = nir_local_reg_create(c->build.impl);
-         reg->num_components = 4;
-         dest.dest.reg.reg = reg;
-         dest.dest.reg.base_offset = 0;
-      } else {
-         assert(!tgsi_dst->Indirect);
-         dest.dest.reg.reg = c->temp_regs[index].reg;
-         dest.dest.reg.base_offset = c->temp_regs[index].offset;
-      }
-   } else if (tgsi_dst->File == TGSI_FILE_OUTPUT) {
-      dest.dest.reg.reg = c->output_regs[index].reg;
-      dest.dest.reg.base_offset = c->output_regs[index].offset;
-   } else if (tgsi_dst->File == TGSI_FILE_ADDRESS) {
-      assert(index == 0);
-      dest.dest.reg.reg = c->addr_reg;
-   }
-
-   dest.write_mask = tgsi_dst->WriteMask;
-   dest.saturate = false;
-
-   if (tgsi_dst->Indirect && (tgsi_dst->File != TGSI_FILE_TEMPORARY)) {
-      nir_src *indirect = malloc(sizeof(nir_src));
-      *indirect = nir_src_for_ssa(ttn_src_for_indirect(c, &tgsi_fdst->Indirect));
-      dest.dest.reg.indirect = indirect;
-   }
-
-   return dest;
-}
-
 static nir_variable *
 ttn_get_var(struct ttn_compile *c, struct tgsi_full_dst_register *tgsi_fdst)
 {
@@ -930,37 +861,18 @@ ttn_get_src(struct ttn_compile *c, struct tgsi_full_src_register *tgsi_fsrc,
    return def;
 }
 
-static void
-ttn_move_dest_masked(nir_builder *b, nir_alu_dest dest,
-                     nir_ssa_def *def, unsigned write_mask)
-{
-   if (!(dest.write_mask & write_mask))
-      return;
-
-   nir_alu_instr *mov = nir_alu_instr_create(b->shader, nir_op_mov);
-   mov->dest = dest;
-   mov->dest.write_mask &= write_mask;
-   mov->src[0].src = nir_src_for_ssa(def);
-   for (unsigned i = def->num_components; i < 4; i++)
-      mov->src[0].swizzle[i] = def->num_components - 1;
-   nir_builder_instr_insert(b, &mov->instr);
-}
-
-static void
-ttn_move_dest(nir_builder *b, nir_alu_dest dest, nir_ssa_def *def)
-{
-   ttn_move_dest_masked(b, dest, def, TGSI_WRITEMASK_XYZW);
-}
-
-static void
-ttn_alu(nir_builder *b, nir_op op, nir_alu_dest dest, unsigned dest_bitsize,
-        nir_ssa_def **src)
+static nir_ssa_def *
+ttn_alu(nir_builder *b, nir_op op, unsigned dest_bitsize, nir_ssa_def **src)
 {
    nir_ssa_def *def = nir_build_alu_src_arr(b, op, src);
    if (def->bit_size == 1)
       def = nir_ineg(b, nir_b2iN(b, def, dest_bitsize));
    assert(def->bit_size == dest_bitsize);
    if (dest_bitsize == 64) {
+      /* Replicate before bitcasting, so we end up with 4x32 at the end */
+      if (def->num_components == 1)
+         def = nir_replicate(b, def, 2);
+
       if (def->num_components > 2) {
          /* 32 -> 64 bit conversion ops are supposed to only convert the first
           * two components, and we need to truncate here to avoid creating a
@@ -970,13 +882,7 @@ ttn_alu(nir_builder *b, nir_op op, nir_alu_dest dest, unsigned dest_bitsize,
       }
       def = nir_bitcast_vector(b, def, 32);
    }
-   ttn_move_dest(b, dest, def);
-}
-
-static void
-ttn_arl(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
-{
-   ttn_move_dest(b, dest, nir_f2i32(b, nir_ffloor(b, src[0])));
+   return def;
 }
 
 /* EXP - Approximate Exponential Base 2
@@ -985,17 +891,15 @@ ttn_arl(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
  *  dst.z = 2^{src.x}
  *  dst.w = 1.0
  */
-static void
-ttn_exp(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
+static nir_ssa_def *
+ttn_exp(nir_builder *b, nir_ssa_def **src)
 {
    nir_ssa_def *srcx = ttn_channel(b, src[0], X);
 
-   ttn_move_dest_masked(b, dest, nir_fexp2(b, nir_ffloor(b, srcx)),
-                        TGSI_WRITEMASK_X);
-   ttn_move_dest_masked(b, dest, nir_fsub(b, srcx, nir_ffloor(b, srcx)),
-                        TGSI_WRITEMASK_Y);
-   ttn_move_dest_masked(b, dest, nir_fexp2(b, srcx), TGSI_WRITEMASK_Z);
-   ttn_move_dest_masked(b, dest, nir_imm_float(b, 1.0), TGSI_WRITEMASK_W);
+   return nir_vec4(b, nir_fexp2(b, nir_ffloor(b, srcx)),
+                      nir_fsub(b, srcx, nir_ffloor(b, srcx)),
+                      nir_fexp2(b, srcx),
+                      nir_imm_float(b, 1.0));
 }
 
 /* LOG - Approximate Logarithm Base 2
@@ -1004,18 +908,16 @@ ttn_exp(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
  *  dst.z = \log_2{|src.x|}
  *  dst.w = 1.0
  */
-static void
-ttn_log(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
+static nir_ssa_def *
+ttn_log(nir_builder *b, nir_ssa_def **src)
 {
    nir_ssa_def *abs_srcx = nir_fabs(b, ttn_channel(b, src[0], X));
    nir_ssa_def *log2 = nir_flog2(b, abs_srcx);
 
-   ttn_move_dest_masked(b, dest, nir_ffloor(b, log2), TGSI_WRITEMASK_X);
-   ttn_move_dest_masked(b, dest,
-                        nir_fdiv(b, abs_srcx, nir_fexp2(b, nir_ffloor(b, log2))),
-                        TGSI_WRITEMASK_Y);
-   ttn_move_dest_masked(b, dest, nir_flog2(b, abs_srcx), TGSI_WRITEMASK_Z);
-   ttn_move_dest_masked(b, dest, nir_imm_float(b, 1.0), TGSI_WRITEMASK_W);
+   return nir_vec4(b, nir_ffloor(b, log2),
+                      nir_fdiv(b, abs_srcx, nir_fexp2(b, nir_ffloor(b, log2))),
+                      nir_flog2(b, abs_srcx),
+                      nir_imm_float(b, 1.0));
 }
 
 /* DST - Distance Vector
@@ -1024,13 +926,14 @@ ttn_log(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
  *   dst.z = src0.z
  *   dst.w = src1.w
  */
-static void
-ttn_dst(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
+static nir_ssa_def *
+ttn_dst(nir_builder *b, nir_ssa_def **src)
 {
-   ttn_move_dest_masked(b, dest, nir_imm_float(b, 1.0), TGSI_WRITEMASK_X);
-   ttn_move_dest_masked(b, dest, nir_fmul(b, src[0], src[1]), TGSI_WRITEMASK_Y);
-   ttn_move_dest_masked(b, dest, nir_mov(b, src[0]), TGSI_WRITEMASK_Z);
-   ttn_move_dest_masked(b, dest, nir_mov(b, src[1]), TGSI_WRITEMASK_W);
+   return nir_vec4(b, nir_imm_float(b, 1.0),
+                      nir_fmul(b, ttn_channel(b, src[0], Y),
+                                  ttn_channel(b, src[1], Y)),
+                      ttn_channel(b, src[0], Z),
+                      ttn_channel(b, src[1], W));
 }
 
 /* LIT - Light Coefficients
@@ -1039,89 +942,22 @@ ttn_dst(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
  *  dst.z = (src.x > 0.0) ? max(src.y, 0.0)^{clamp(src.w, -128.0, 128.0))} : 0
  *  dst.w = 1.0
  */
-static void
-ttn_lit(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
-{
-   ttn_move_dest_masked(b, dest, nir_imm_float(b, 1.0), TGSI_WRITEMASK_XW);
-
-   ttn_move_dest_masked(b, dest, nir_fmax(b, ttn_channel(b, src[0], X),
-                                          nir_imm_float(b, 0.0)), TGSI_WRITEMASK_Y);
-
-   if (dest.write_mask & TGSI_WRITEMASK_Z) {
-      nir_ssa_def *src0_y = ttn_channel(b, src[0], Y);
-      nir_ssa_def *wclamp = nir_fmax(b, nir_fmin(b, ttn_channel(b, src[0], W),
-                                                 nir_imm_float(b, 128.0)),
-                                     nir_imm_float(b, -128.0));
-      nir_ssa_def *pow = nir_fpow(b, nir_fmax(b, src0_y, nir_imm_float(b, 0.0)),
-                                  wclamp);
-
-      ttn_move_dest_masked(b, dest,
-                           nir_bcsel(b,
-                                     nir_flt_imm(b,
-                                                 ttn_channel(b, src[0], X),
-                                                 0.0),
-                                     nir_imm_float(b, 0.0),
-                                     pow),
-                           TGSI_WRITEMASK_Z);
-   }
-}
-
-static void
-ttn_sle(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
-{
-   ttn_move_dest(b, dest, nir_sge(b, src[1], src[0]));
-}
-
-static void
-ttn_sgt(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
-{
-   ttn_move_dest(b, dest, nir_slt(b, src[1], src[0]));
-}
-
-static void
-ttn_dp2(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
-{
-   ttn_move_dest(b, dest, nir_fdot2(b, src[0], src[1]));
-}
-
-static void
-ttn_dp3(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
-{
-   ttn_move_dest(b, dest, nir_fdot3(b, src[0], src[1]));
-}
-
-static void
-ttn_dp4(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
-{
-   ttn_move_dest(b, dest, nir_fdot4(b, src[0], src[1]));
-}
-
-static void
-ttn_umad(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
-{
-   ttn_move_dest(b, dest, nir_iadd(b, nir_imul(b, src[0], src[1]), src[2]));
-}
-
-static void
-ttn_arr(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
-{
-   ttn_move_dest(b, dest, nir_f2i32(b, nir_fround_even(b, src[0])));
-}
-
-static void
-ttn_cmp(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
-{
-   ttn_move_dest(b, dest, nir_bcsel(b,
-                                    nir_flt_imm(b, src[0], 0.0),
-                                    src[1], src[2]));
-}
-
-static void
-ttn_ucmp(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
+static nir_ssa_def *
+ttn_lit(nir_builder *b, nir_ssa_def **src)
 {
-   ttn_move_dest(b, dest, nir_bcsel(b,
-                                    nir_ine_imm(b, src[0], 0),
-                                    src[1], src[2]));
+   nir_ssa_def *src0_y = ttn_channel(b, src[0], Y);
+   nir_ssa_def *wclamp = nir_fmax(b, nir_fmin(b, ttn_channel(b, src[0], W),
+                                              nir_imm_float(b, 128.0)),
+                                  nir_imm_float(b, -128.0));
+   nir_ssa_def *pow = nir_fpow(b, nir_fmax(b, src0_y, nir_imm_float(b, 0.0)),
+                               wclamp);
+   nir_ssa_def *z = nir_bcsel(b, nir_flt_imm(b, ttn_channel(b, src[0], X), 0.0),
+                                 nir_imm_float(b, 0.0), pow);
+
+   return nir_vec4(b, nir_imm_float(b, 1.0),
+                      nir_fmax(b, ttn_channel(b, src[0], X),
+                                  nir_imm_float(b, 0.0)),
+                      z, nir_imm_float(b, 1.0));
 }
 
 static void
@@ -1131,14 +967,14 @@ ttn_barrier(nir_builder *b)
 }
 
 static void
-ttn_kill(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
+ttn_kill(nir_builder *b)
 {
    nir_discard(b);
    b->shader->info.fs.uses_discard = true;
 }
 
 static void
-ttn_kill_if(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
+ttn_kill_if(nir_builder *b, nir_ssa_def **src)
 {
    /* flt must be exact, because NaN shouldn't discard. (apps rely on this) */
    b->exact = true;
@@ -1335,8 +1171,8 @@ add_ssbo_var(struct ttn_compile *c, int binding)
    }
 }
 
-static void
-ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
+static nir_ssa_def *
+ttn_tex(struct ttn_compile *c, nir_ssa_def **src)
 {
    nir_builder *b = &c->build;
    struct tgsi_full_instruction *tgsi_inst = &c->token->FullInstruction;
@@ -1572,9 +1408,7 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
    nir_ssa_dest_init(&instr->instr, &instr->dest,
                      nir_tex_instr_dest_size(instr), 32);
    nir_builder_instr_insert(b, &instr->instr);
-
-   /* Resolve the writemask on the texture op. */
-   ttn_move_dest(b, dest, &instr->dest.ssa);
+   return nir_pad_vector_imm_int(b, &instr->dest.ssa, 0, 4);
 }
 
 /* TGSI_OPCODE_TXQ is actually two distinct operations:
@@ -1586,8 +1420,8 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
  *
  * dst.xyz map to NIR txs opcode, and dst.w maps to query_levels
  */
-static void
-ttn_txq(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
+static nir_ssa_def *
+ttn_txq(struct ttn_compile *c, nir_ssa_def **src)
 {
    nir_builder *b = &c->build;
    struct tgsi_full_instruction *tgsi_inst = &c->token->FullInstruction;
@@ -1637,8 +1471,9 @@ ttn_txq(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
    nir_ssa_dest_init(&qlv->instr, &qlv->dest, 1, 32);
    nir_builder_instr_insert(b, &qlv->instr);
 
-   ttn_move_dest_masked(b, dest, &txs->dest.ssa, TGSI_WRITEMASK_XYZ);
-   ttn_move_dest_masked(b, dest, &qlv->dest.ssa, TGSI_WRITEMASK_W);
+   return nir_vector_insert_imm(b,
+                                nir_pad_vector_imm_int(b, &txs->dest.ssa, 0, 4),
+                                &qlv->dest.ssa, 3);
 }
 
 static enum glsl_base_type
@@ -1673,8 +1508,8 @@ get_mem_qualifier(struct tgsi_full_instruction *tgsi_inst)
    return access;
 }
 
-static void
-ttn_mem(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
+static nir_ssa_def *
+ttn_mem(struct ttn_compile *c, nir_ssa_def **src)
 {
    nir_builder *b = &c->build;
    struct tgsi_full_instruction *tgsi_inst = &c->token->FullInstruction;
@@ -1794,9 +1629,10 @@ ttn_mem(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
       nir_ssa_dest_init(&instr->instr, &instr->dest, instr->num_components,
                         32);
       nir_builder_instr_insert(b, &instr->instr);
-      ttn_move_dest(b, dest, &instr->dest.ssa);
+      return nir_pad_vector_imm_int(b, &instr->dest.ssa, 0, 4);
    } else {
       nir_builder_instr_insert(b, &instr->instr);
+      return NULL;
    }
 }
 
@@ -1996,7 +1832,6 @@ ttn_emit_instruction(struct ttn_compile *c)
    for (i = 0; i < tgsi_inst->Instruction.NumSrcRegs; i++) {
       src[i] = ttn_get_src(c, &tgsi_inst->Src[i], i);
    }
-   nir_alu_dest dest = ttn_get_dest(c, tgsi_dst);
 
    unsigned tgsi_dst_type = tgsi_opcode_infer_dst_type(tgsi_op, 0);
 
@@ -2006,107 +1841,112 @@ ttn_emit_instruction(struct ttn_compile *c)
     */
    unsigned dst_bitsize = tgsi_type_is_64bit(tgsi_dst_type) ? 64 : 32;
 
+   /* If this is non-NULL after the switch, it will be written to the
+    * corresponding register/variable/etc after.
+    */
+   nir_ssa_def *dst = NULL;
+
    switch (tgsi_op) {
    case TGSI_OPCODE_RSQ:
-      ttn_move_dest(b, dest, nir_frsq(b, ttn_channel(b, src[0], X)));
+      dst = nir_frsq(b, ttn_channel(b, src[0], X));
       break;
 
    case TGSI_OPCODE_SQRT:
-      ttn_move_dest(b, dest, nir_fsqrt(b, ttn_channel(b, src[0], X)));
+      dst = nir_fsqrt(b, ttn_channel(b, src[0], X));
       break;
 
    case TGSI_OPCODE_RCP:
-      ttn_move_dest(b, dest, nir_frcp(b, ttn_channel(b, src[0], X)));
+      dst = nir_frcp(b, ttn_channel(b, src[0], X));
       break;
 
    case TGSI_OPCODE_EX2:
-      ttn_move_dest(b, dest, nir_fexp2(b, ttn_channel(b, src[0], X)));
+      dst = nir_fexp2(b, ttn_channel(b, src[0], X));
       break;
 
    case TGSI_OPCODE_LG2:
-      ttn_move_dest(b, dest, nir_flog2(b, ttn_channel(b, src[0], X)));
+      dst = nir_flog2(b, ttn_channel(b, src[0], X));
       break;
 
    case TGSI_OPCODE_POW:
-      ttn_move_dest(b, dest, nir_fpow(b,
-                                      ttn_channel(b, src[0], X),
-                                      ttn_channel(b, src[1], X)));
+      dst = nir_fpow(b, ttn_channel(b, src[0], X), ttn_channel(b, src[1], X));
       break;
 
    case TGSI_OPCODE_COS:
-      ttn_move_dest(b, dest, nir_fcos(b, ttn_channel(b, src[0], X)));
+      dst = nir_fcos(b, ttn_channel(b, src[0], X));
       break;
 
    case TGSI_OPCODE_SIN:
-      ttn_move_dest(b, dest, nir_fsin(b, ttn_channel(b, src[0], X)));
+      dst = nir_fsin(b, ttn_channel(b, src[0], X));
       break;
 
    case TGSI_OPCODE_ARL:
-      ttn_arl(b, op_trans[tgsi_op], dest, src);
+      dst = nir_f2i32(b, nir_ffloor(b, src[0]));
       break;
 
    case TGSI_OPCODE_EXP:
-      ttn_exp(b, op_trans[tgsi_op], dest, src);
+      dst = ttn_exp(b, src);
       break;
 
    case TGSI_OPCODE_LOG:
-      ttn_log(b, op_trans[tgsi_op], dest, src);
+      dst = ttn_log(b, src);
       break;
 
    case TGSI_OPCODE_DST:
-      ttn_dst(b, op_trans[tgsi_op], dest, src);
+      dst = ttn_dst(b, src);
       break;
 
    case TGSI_OPCODE_LIT:
-      ttn_lit(b, op_trans[tgsi_op], dest, src);
+      dst = ttn_lit(b, src);
       break;
 
    case TGSI_OPCODE_DP2:
-      ttn_dp2(b, op_trans[tgsi_op], dest, src);
+      dst = nir_fdot2(b, src[0], src[1]);
       break;
 
    case TGSI_OPCODE_DP3:
-      ttn_dp3(b, op_trans[tgsi_op], dest, src);
+      dst = nir_fdot3(b, src[0], src[1]);
       break;
 
    case TGSI_OPCODE_DP4:
-      ttn_dp4(b, op_trans[tgsi_op], dest, src);
+      dst = nir_fdot4(b, src[0], src[1]);
       break;
 
    case TGSI_OPCODE_UMAD:
-      ttn_umad(b, op_trans[tgsi_op], dest, src);
+      dst = nir_iadd(b, nir_imul(b, src[0], src[1]), src[2]);
       break;
 
    case TGSI_OPCODE_LRP:
-      ttn_move_dest(b, dest, nir_flrp(b, src[2], src[1], src[0]));
+      dst = nir_flrp(b, src[2], src[1], src[0]);
       break;
 
    case TGSI_OPCODE_KILL:
-      ttn_kill(b, op_trans[tgsi_op], dest, src);
+      ttn_kill(b);
       break;
 
    case TGSI_OPCODE_ARR:
-      ttn_arr(b, op_trans[tgsi_op], dest, src);
+      dst = nir_f2i32(b, nir_fround_even(b, src[0]));
       break;
 
    case TGSI_OPCODE_CMP:
-      ttn_cmp(b, op_trans[tgsi_op], dest, src);
+      dst = nir_bcsel(b, nir_flt(b, src[0], nir_imm_float(b, 0.0)),
+                      src[1], src[2]);
       break;
 
    case TGSI_OPCODE_UCMP:
-      ttn_ucmp(b, op_trans[tgsi_op], dest, src);
+      dst = nir_bcsel(b, nir_ine(b, src[0], nir_imm_int(b, 0)),
+                      src[1], src[2]);
       break;
 
    case TGSI_OPCODE_SGT:
-      ttn_sgt(b, op_trans[tgsi_op], dest, src);
+      dst = nir_slt(b, src[1], src[0]);
       break;
 
    case TGSI_OPCODE_SLE:
-      ttn_sle(b, op_trans[tgsi_op], dest, src);
+      dst = nir_sge(b, src[1], src[0]);
       break;
 
    case TGSI_OPCODE_KILL_IF:
-      ttn_kill_if(b, op_trans[tgsi_op], dest, src);
+      ttn_kill_if(b, src);
       break;
 
    case TGSI_OPCODE_TEX:
@@ -2122,16 +1962,16 @@ ttn_emit_instruction(struct ttn_compile *c)
    case TGSI_OPCODE_TXF_LZ:
    case TGSI_OPCODE_TG4:
    case TGSI_OPCODE_LODQ:
-      ttn_tex(c, dest, src);
+      dst = ttn_tex(c, src);
       break;
 
    case TGSI_OPCODE_TXQ:
-      ttn_txq(c, dest, src);
+      dst = ttn_txq(c, src);
       break;
 
    case TGSI_OPCODE_LOAD:
    case TGSI_OPCODE_STORE:
-      ttn_mem(c, dest, src);
+      dst = ttn_mem(c, src);
       break;
 
    case TGSI_OPCODE_NOP:
@@ -2175,7 +2015,7 @@ ttn_emit_instruction(struct ttn_compile *c)
 
    default:
       if (op_trans[tgsi_op] != 0 || tgsi_op == TGSI_OPCODE_MOV) {
-         ttn_alu(b, op_trans[tgsi_op], dest, dst_bitsize, src);
+         dst = ttn_alu(b, op_trans[tgsi_op], dst_bitsize, src);
       } else {
          fprintf(stderr, "unknown TGSI opcode: %s\n",
                  tgsi_get_opcode_name(tgsi_op));
@@ -2184,23 +2024,63 @@ ttn_emit_instruction(struct ttn_compile *c)
       break;
    }
 
-   if (tgsi_inst->Instruction.Saturate) {
-      assert(!dest.dest.is_ssa);
-      ttn_move_dest(b, dest, nir_fsat(b, ttn_src_for_dest(b, &dest)));
-   }
+   if (dst == NULL)
+      return;
 
-   /* if the dst has a matching var, append store_var to move
-    * output from reg to var
-    */
+   if (tgsi_inst->Instruction.Saturate)
+      dst = nir_fsat(b, dst);
+
+   if (dst->num_components == 1)
+      dst = nir_replicate(b, dst, 4);
+   else if (dst->num_components == 2)
+      dst = nir_pad_vector_imm_int(b, dst, 0, 4); /* for 64->32 conversions */
+
+   assert(dst->num_components == 4);
+
+   /* Finally, copy the SSA def to the NIR variable/register */
    nir_variable *var = ttn_get_var(c, tgsi_dst);
    if (var) {
       unsigned index = tgsi_dst->Register.Index;
       unsigned offset = c->temp_regs[index].offset;
       struct tgsi_ind_register *indirect = tgsi_dst->Register.Indirect ?
                                            &tgsi_dst->Indirect : NULL;
-      nir_src val = nir_src_for_reg(dest.dest.reg.reg);
-      nir_store_deref(b, ttn_array_deref(c, var, offset, indirect),
-                      nir_ssa_for_src(b, val, 4), dest.write_mask);
+      nir_store_deref(b, ttn_array_deref(c, var, offset, indirect), dst,
+                      tgsi_dst->Register.WriteMask);
+   } else {
+      unsigned index = tgsi_dst->Register.Index;
+      nir_register *reg = NULL;
+      unsigned base_offset = 0;
+
+      if (tgsi_dst->Register.File == TGSI_FILE_TEMPORARY) {
+         assert(!c->temp_regs[index].var && "handled above");
+         assert(!tgsi_dst->Register.Indirect);
+
+         reg = c->temp_regs[index].reg;
+         base_offset = c->temp_regs[index].offset;
+      } else if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
+         reg = c->output_regs[index].reg;
+         base_offset = c->output_regs[index].offset;
+      } else if (tgsi_dst->Register.File == TGSI_FILE_ADDRESS) {
+         assert(index == 0);
+         reg = c->addr_reg;
+      }
+
+      nir_src *indirect = NULL;
+      if (tgsi_dst->Register.Indirect) {
+         nir_ssa_def *ind_def = ttn_src_for_indirect(c, &tgsi_dst->Indirect);
+         indirect = malloc(sizeof(nir_src));
+         *indirect = nir_src_for_ssa(ind_def);
+      }
+
+      nir_alu_instr *mov = nir_alu_instr_create(b->shader, nir_op_mov);
+      mov->dest.dest = nir_dest_for_reg(reg);
+      mov->dest.dest.reg.base_offset = base_offset;
+      mov->dest.dest.reg.indirect = indirect;
+      mov->dest.write_mask = tgsi_dst->Register.WriteMask;
+      mov->src[0].src = nir_src_for_ssa(dst);
+      for (unsigned i = dst->num_components; i < 4; i++)
+         mov->src[0].swizzle[i] = dst->num_components - 1;
+      nir_builder_instr_insert(b, &mov->instr);
    }
 }