intel/nir: Lower 8-bit scan/reduce ops to 16-bit

author Jason Ekstrand <jason@jlekstrand.net>

Fri, 6 Nov 2020 05:23:07 +0000 (23:23 -0600)

committer Marge Bot <eric+marge@anholt.net>

Mon, 9 Nov 2020 18:58:51 +0000 (18:58 +0000)
author Jason Ekstrand <jason@jlekstrand.net>
Fri, 6 Nov 2020 05:23:07 +0000 (23:23 -0600)
committer Marge Bot <eric+marge@anholt.net>
Mon, 9 Nov 2020 18:58:51 +0000 (18:58 +0000)
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp

index 2cbcf4c..38d7540 100644 (file)
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -5250,28 +5250,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
        opcode brw_op = brw_op_for_nir_reduction_op(redop);
        brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
  
-      /* There are a couple of register region issues that make things
-       * complicated for 8-bit types:
-       *
-       *    1. Only raw moves are allowed to write to a packed 8-bit
-       *       destination.
-       *    2. If we use a strided destination, the efficient way to do scan
-       *       operations ends up using strides that are too big to encode in
-       *       an instruction.
-       *
-       * To get around these issues, we just do all 8-bit scan operations in
-       * 16 bits.  It's actually fewer instructions than what we'd have to do
-       * if we were trying to do it in native 8-bit types and the results are
-       * the same once we truncate to 8 bits at the end.
-       */
-      brw_reg_type scan_type = src.type;
-      if (type_sz(scan_type) == 1)
-         scan_type = brw_reg_type_from_bit_size(16, src.type);
-
        /* Set up a register for all of our scratching around and initialize it
         * to reduction operation's identity value.
         */
-      fs_reg scan = bld.vgrf(scan_type);
+      fs_reg scan = bld.vgrf(src.type);
        bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
  
        bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
@@ -5314,28 +5296,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
        opcode brw_op = brw_op_for_nir_reduction_op(redop);
        brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
  
-      /* There are a couple of register region issues that make things
-       * complicated for 8-bit types:
-       *
-       *    1. Only raw moves are allowed to write to a packed 8-bit
-       *       destination.
-       *    2. If we use a strided destination, the efficient way to do scan
-       *       operations ends up using strides that are too big to encode in
-       *       an instruction.
-       *
-       * To get around these issues, we just do all 8-bit scan operations in
-       * 16 bits.  It's actually fewer instructions than what we'd have to do
-       * if we were trying to do it in native 8-bit types and the results are
-       * the same once we truncate to 8 bits at the end.
-       */
-      brw_reg_type scan_type = src.type;
-      if (type_sz(scan_type) == 1)
-         scan_type = brw_reg_type_from_bit_size(16, src.type);
-
        /* Set up a register for all of our scratching around and initialize it
         * to reduction operation's identity value.
         */
-      fs_reg scan = bld.vgrf(scan_type);
+      fs_reg scan = bld.vgrf(src.type);
        const fs_builder allbld = bld.exec_all();
        allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
  
@@ -5344,7 +5308,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
            * shift of the contents before we can begin.  To make things worse,
            * we can't do this with a normal stride; we have to use indirects.
            */
-         fs_reg shifted = bld.vgrf(scan_type);
+         fs_reg shifted = bld.vgrf(src.type);
           fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
           allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
                           brw_imm_w(-1));
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c

index 71771a5..282eac3 100644 (file)
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -672,6 +672,36 @@ lower_bit_size_callback(const nir_instr *instr, UNUSED void *data)
        break;
     }
  
+   case nir_instr_type_intrinsic: {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      switch (intrin->intrinsic) {
+      case nir_intrinsic_reduce:
+      case nir_intrinsic_inclusive_scan:
+      case nir_intrinsic_exclusive_scan:
+         /* There are a couple of register region issues that make things
+          * complicated for 8-bit types:
+          *
+          *    1. Only raw moves are allowed to write to a packed 8-bit
+          *       destination.
+          *    2. If we use a strided destination, the efficient way to do
+          *       scan operations ends up using strides that are too big to
+          *       encode in an instruction.
+          *
+          * To get around these issues, we just do all 8-bit scan and reduce
+          * operations in 16 bits.  That takes fewer instructions than doing
+          * them in native 8-bit types would, and the results are the same
+          * once we truncate to 8 bits at the end.
+          */
+         if (intrin->dest.ssa.bit_size == 8)
+            return 16;
+         return 0;
+
+      default:
+         return 0;
+      }
+      break;
+   }
+
     default:
        return 0;
     }
author	Jason Ekstrand <jason@jlekstrand.net>
	Fri, 6 Nov 2020 05:23:07 +0000 (23:23 -0600)
committer	Marge Bot <eric+marge@anholt.net>
	Mon, 9 Nov 2020 18:58:51 +0000 (18:58 +0000)
src/intel/compiler/brw_fs_nir.cpp		patch | blob | history
src/intel/compiler/brw_nir.c		patch | blob | history