opcode brw_op = brw_op_for_nir_reduction_op(redop);
brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
- /* There are a couple of register region issues that make things
- * complicated for 8-bit types:
- *
- * 1. Only raw moves are allowed to write to a packed 8-bit
- * destination.
- * 2. If we use a strided destination, the efficient way to do scan
- * operations ends up using strides that are too big to encode in
- * an instruction.
- *
- * To get around these issues, we just do all 8-bit scan operations in
- * 16 bits. It's actually fewer instructions than what we'd have to do
- * if we were trying to do it in native 8-bit types and the results are
- * the same once we truncate to 8 bits at the end.
- */
- brw_reg_type scan_type = src.type;
- if (type_sz(scan_type) == 1)
- scan_type = brw_reg_type_from_bit_size(16, src.type);
-
/* Set up a register for all of our scratching around and initialize it
* to reduction operation's identity value.
*/
- fs_reg scan = bld.vgrf(scan_type);
+ fs_reg scan = bld.vgrf(src.type);
bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
opcode brw_op = brw_op_for_nir_reduction_op(redop);
brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
- /* There are a couple of register region issues that make things
- * complicated for 8-bit types:
- *
- * 1. Only raw moves are allowed to write to a packed 8-bit
- * destination.
- * 2. If we use a strided destination, the efficient way to do scan
- * operations ends up using strides that are too big to encode in
- * an instruction.
- *
- * To get around these issues, we just do all 8-bit scan operations in
- * 16 bits. It's actually fewer instructions than what we'd have to do
- * if we were trying to do it in native 8-bit types and the results are
- * the same once we truncate to 8 bits at the end.
- */
- brw_reg_type scan_type = src.type;
- if (type_sz(scan_type) == 1)
- scan_type = brw_reg_type_from_bit_size(16, src.type);
-
/* Set up a register for all of our scratching around and initialize it
* to reduction operation's identity value.
*/
- fs_reg scan = bld.vgrf(scan_type);
+ fs_reg scan = bld.vgrf(src.type);
const fs_builder allbld = bld.exec_all();
allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
* shift of the contents before we can begin. To make things worse,
* we can't do this with a normal stride; we have to use indirects.
*/
- fs_reg shifted = bld.vgrf(scan_type);
+ fs_reg shifted = bld.vgrf(src.type);
fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
brw_imm_w(-1));
break;
}
+ case nir_instr_type_intrinsic: {
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ switch (intrin->intrinsic) {
+ case nir_intrinsic_reduce:
+ case nir_intrinsic_inclusive_scan:
+ case nir_intrinsic_exclusive_scan:
+ /* There are a couple of register region issues that make things
+ * complicated for 8-bit types:
+ *
+ * 1. Only raw moves are allowed to write to a packed 8-bit
+ * destination.
+ * 2. If we use a strided destination, the efficient way to do
+ * scan operations ends up using strides that are too big to
+ * encode in an instruction.
+ *
+ * To get around these issues, we just do all 8-bit scan operations
+ * in 16 bits. It's actually fewer instructions than what we'd have
+ * to do if we were trying to do it in native 8-bit types and the
+ * results are the same once we truncate to 8 bits at the end.
+ */
+ if (intrin->dest.ssa.bit_size == 8)
+ return 16;
+ return 0;
+
+ default:
+ return 0;
+ }
+ break;
+ }
+
default:
return 0;
}