From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 29 Aug 2017 16:21:32 +0000 (-0700)
Subject: i965/fs: Add support for nir_intrinsic_shuffle
X-Git-Tag: upstream/18.1.0~1082
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=90c9f29518d32a29725b114f3b16ad8c62a812ff;p=platform%2Fupstream%2Fmesa.git

i965/fs: Add support for nir_intrinsic_shuffle

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---

diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 30e2e8f..3449c73 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -451,6 +451,15 @@ enum opcode {
     */
    SHADER_OPCODE_BROADCAST,
 
+   /* Pick the channel from its first source register given by the index
+    * specified as second source.
+    *
+    * This is similar to the BROADCAST instruction except that it takes a
+    * dynamic index and potentially puts a different value in each output
+    * channel.
+    */
+   SHADER_OPCODE_SHUFFLE,
+
    SHADER_OPCODE_GET_BUFFER_SIZE,
 
    VEC4_OPCODE_MOV_BYTES,
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index fc4cfbf..47f1f6e 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -310,6 +310,13 @@ fs_inst::has_source_and_destination_hazard() const
    case FS_OPCODE_PACK_HALF_2x16_SPLIT:
       /* Multiple partial writes to the destination */
       return true;
+   case SHADER_OPCODE_SHUFFLE:
+      /* This instruction returns an arbitrary channel from the source and
+       * gets split into smaller instructions in the generator.  It's possible
+       * that one of the instructions will read from a channel corresponding
+       * to an earlier instruction.
+       */
+      return true;
    default:
       /* The SIMD16 compressed instruction
        *
@@ -2531,6 +2538,20 @@ fs_visitor::opt_algebraic()
          }
          break;
 
+      case SHADER_OPCODE_SHUFFLE:
+         if (is_uniform(inst->src[0])) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->sources = 1;
+            progress = true;
+         } else if (inst->src[1].file == IMM) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0] = component(inst->src[0],
+                                     inst->src[1].ud);
+            inst->sources = 1;
+            progress = true;
+         }
+         break;
+
       default:
 	 break;
       }
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index b0799a0..1b7df84 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -471,6 +471,11 @@ private:
                               struct brw_reg reg,
                               struct brw_reg indirect_byte_offset);
 
+   void generate_shuffle(fs_inst *inst,
+                         struct brw_reg dst,
+                         struct brw_reg src,
+                         struct brw_reg idx);
+
    bool patch_discard_jumps_to_fb_writes();
 
    const struct brw_compiler *compiler;
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index df34099..9b8f8ce 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -541,6 +541,106 @@ fs_generator::generate_mov_indirect(fs_inst *inst,
 }
 
 void
+fs_generator::generate_shuffle(fs_inst *inst,
+                               struct brw_reg dst,
+                               struct brw_reg src,
+                               struct brw_reg idx)
+{
+   /* Ivy bridge has some strange behavior that makes this a real pain to
+    * implement for 64-bit values so we just don't bother.
+    */
+   assert(devinfo->gen >= 8 || devinfo->is_haswell || type_sz(src.type) <= 4);
+
+   /* Because we're using the address register, we're limited to 8-wide
+    * execution on gen7.  On gen8, we're limited to 16-wide by the address
+    * register file and 8-wide for 64-bit types.  We could try and make this
+    * instruction splittable higher up in the compiler but that gets weird
+    * because it reads all of the channels regardless of execution size.  It's
+    * easier just to split it here.
+    */
+   const unsigned lower_width =
+      (devinfo->gen <= 7 || type_sz(src.type) > 4) ?
+      8 : MIN2(16, inst->exec_size);
+
+   brw_set_default_exec_size(p, cvt(lower_width) - 1);
+   for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
+      brw_set_default_group(p, group);
+
+      if ((src.vstride == 0 && src.hstride == 0) ||
+          idx.file == BRW_IMMEDIATE_VALUE) {
+         /* Trivial, the source is already uniform or the index is a constant.
+          * We will typically not get here if the optimizer is doing its job,
+          * but asserting would be mean.
+          */
+         const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
+         brw_MOV(p, suboffset(dst, group), stride(suboffset(src, i), 0, 1, 0));
+      } else {
+         /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
+         struct brw_reg addr = vec8(brw_address_reg(0));
+
+         struct brw_reg group_idx = suboffset(idx, group);
+
+         if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
+            /* Things get grumpy if the register is too wide. */
+            group_idx.width--;
+            group_idx.vstride--;
+         }
+
+         assert(type_sz(group_idx.type) <= 4);
+         if (type_sz(group_idx.type) == 4) {
+            /* The destination stride of an instruction (in bytes) must be
+             * greater than or equal to the size of the rest of the
+             * instruction.  Since the address register is of type UW, we
+             * can't use a D-type instruction.  In order to get around this,
+             * re retype to UW and use a stride.
+             */
+            group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W);
+         }
+
+         /* Take into account the component size and horizontal stride. */
+         assert(src.vstride == src.hstride + src.width);
+         brw_SHL(p, addr, group_idx,
+                 brw_imm_uw(_mesa_logbase2(type_sz(src.type)) +
+                            src.hstride - 1));
+
+         /* Add on the register start offset */
+         brw_ADD(p, addr, addr, brw_imm_uw(src.nr * REG_SIZE + src.subnr));
+
+         if (type_sz(src.type) > 4 &&
+             ((devinfo->gen == 7 && !devinfo->is_haswell) ||
+              devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+            /* IVB has an issue (which we found empirically) where it reads
+             * two address register components per channel for indirectly
+             * addressed 64-bit sources.
+             *
+             * From the Cherryview PRM Vol 7. "Register Region Restrictions":
+             *
+             *    "When source or destination datatype is 64b or operation is
+             *    integer DWord multiply, indirect addressing must not be
+             *    used."
+             *
+             * To work around both of these, we do two integer MOVs insead of
+             * one 64-bit MOV.  Because no double value should ever cross a
+             * register boundary, it's safe to use the immediate offset in the
+             * indirect here to handle adding 4 bytes to the offset and avoid
+             * the extra ADD to the register file.
+             */
+            struct brw_reg gdst = suboffset(dst, group);
+            struct brw_reg dst_d = retype(spread(gdst, 2),
+                                          BRW_REGISTER_TYPE_D);
+            brw_MOV(p, dst_d,
+                    retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
+            brw_MOV(p, byte_offset(dst_d, 4),
+                    retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
+         } else {
+            brw_MOV(p, suboffset(dst, group),
+                    retype(brw_VxH_indirect(0, 0), src.type));
+         }
+      }
+   }
+}
+
+void
 fs_generator::generate_urb_read(fs_inst *inst,
                                 struct brw_reg dst,
                                 struct brw_reg header)
@@ -2189,6 +2289,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          brw_broadcast(p, dst, src[0], src[1]);
          break;
 
+      case SHADER_OPCODE_SHUFFLE:
+         generate_shuffle(inst, dst, src[0], src[1]);
+         break;
+
       case FS_OPCODE_SET_SAMPLE_ID:
          generate_set_sample_id(inst, dst, src[0], src[1]);
          break;
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 71e871c..d2d32f9 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4507,6 +4507,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_shuffle: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      const fs_reg index = get_nir_src(instr->src[1]);
+
+      bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
+      break;
+   }
+
    case nir_intrinsic_first_invocation: {
       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
       bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index cbfafd4..dbad4a1 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -656,6 +656,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir)
       .lower_to_scalar = true,
       .lower_subgroup_masks = true,
       .lower_vote_trivial = !is_scalar,
+      .lower_shuffle = true,
    };
    OPT(nir_lower_subgroups, &subgroups_options);
 
diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
index abfad4e..b1227e1 100644
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -330,6 +330,8 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
       return "find_live_channel";
    case SHADER_OPCODE_BROADCAST:
       return "broadcast";
+   case SHADER_OPCODE_SHUFFLE:
+      return "shuffle";
 
    case SHADER_OPCODE_GET_BUFFER_SIZE:
       return "get_buffer_size";