static bool
try_constant_propagate(const brw_compiler *compiler, fs_inst *inst,
- acp_entry *entry)
+ acp_entry *entry, int arg)
{
const struct intel_device_info *devinfo = compiler->devinfo;
bool progress = false;
if (type_sz(entry->src.type) > 4)
return false;
- for (int i = inst->sources - 1; i >= 0; i--) {
- if (inst->src[i].file != VGRF)
- continue;
+ if (inst->src[arg].file != VGRF)
+ return false;
- assert(entry->dst.file == VGRF);
- if (inst->src[i].nr != entry->dst.nr)
- continue;
+ assert(entry->dst.file == VGRF);
+ if (inst->src[arg].nr != entry->dst.nr)
+ return false;
- /* Bail if inst is reading a range that isn't contained in the range
- * that entry is writing.
- */
- if (!region_contained_in(inst->src[i], inst->size_read(i),
- entry->dst, entry->size_written))
- continue;
+ /* Bail if inst is reading a range that isn't contained in the range
+ * that entry is writing.
+ */
+ if (!region_contained_in(inst->src[arg], inst->size_read(arg),
+ entry->dst, entry->size_written))
+ return false;
- /* If the size of the use type is larger than the size of the entry
- * type, the entry doesn't contain all of the data that the user is
- * trying to use.
- */
- if (type_sz(inst->src[i].type) > type_sz(entry->dst.type))
- continue;
+ /* If the size of the use type is larger than the size of the entry
+ * type, the entry doesn't contain all of the data that the user is
+ * trying to use.
+ */
+ if (type_sz(inst->src[arg].type) > type_sz(entry->dst.type))
+ return false;
- fs_reg val = entry->src;
+ fs_reg val = entry->src;
- /* If the size of the use type is smaller than the size of the entry,
- * clamp the value to the range of the use type. This enables constant
- * copy propagation in cases like
- *
- *
- * mov(8) g12<1>UD 0x0000000cUD
- * ...
- * mul(8) g47<1>D g86<8,8,1>D g12<16,8,2>W
+ /* If the size of the use type is smaller than the size of the entry,
+ * clamp the value to the range of the use type. This enables constant
+ * copy propagation in cases like
+ *
+ *
+ * mov(8) g12<1>UD 0x0000000cUD
+ * ...
+ * mul(8) g47<1>D g86<8,8,1>D g12<16,8,2>W
+ */
+ if (type_sz(inst->src[arg].type) < type_sz(entry->dst.type)) {
+ if (type_sz(inst->src[arg].type) != 2 || type_sz(entry->dst.type) != 4)
+ return false;
+
+ assert(inst->src[arg].subnr == 0 || inst->src[arg].subnr == 2);
+
+ /* When subnr is 0, we want the lower 16-bits, and when it's 2, we
+ * want the upper 16-bits. No other values of subnr are valid for a
+ * UD source.
*/
- if (type_sz(inst->src[i].type) < type_sz(entry->dst.type)) {
- if (type_sz(inst->src[i].type) != 2 || type_sz(entry->dst.type) != 4)
- continue;
+ const uint16_t v = inst->src[arg].subnr == 2 ? val.ud >> 16 : val.ud;
- assert(inst->src[i].subnr == 0 || inst->src[i].subnr == 2);
+ val.ud = v | (uint32_t(v) << 16);
+ }
- /* When subnr is 0, we want the lower 16-bits, and when it's 2, we
- * want the upper 16-bits. No other values of subnr are valid for a
- * UD source.
- */
- const uint16_t v = inst->src[i].subnr == 2 ? val.ud >> 16 : val.ud;
+ val.type = inst->src[arg].type;
- val.ud = v | (uint32_t(v) << 16);
+ if (inst->src[arg].abs) {
+ if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
+ !brw_abs_immediate(val.type, &val.as_brw_reg())) {
+ return false;
}
+ }
- val.type = inst->src[i].type;
-
- if (inst->src[i].abs) {
- if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
- !brw_abs_immediate(val.type, &val.as_brw_reg())) {
- continue;
- }
+ if (inst->src[arg].negate) {
+ if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
+ !brw_negate_immediate(val.type, &val.as_brw_reg())) {
+ return false;
}
+ }
- if (inst->src[i].negate) {
- if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
- !brw_negate_immediate(val.type, &val.as_brw_reg())) {
- continue;
- }
+ switch (inst->opcode) {
+ case BRW_OPCODE_MOV:
+ case SHADER_OPCODE_LOAD_PAYLOAD:
+ case FS_OPCODE_PACK:
+ inst->src[arg] = val;
+ progress = true;
+ break;
+
+ case SHADER_OPCODE_POW:
+ /* Allow constant propagation into src1 (except on Gen 6 which
+ * doesn't support scalar source math), and let constant combining
+ * promote the constant on Gen < 8.
+ */
+ if (devinfo->ver == 6)
+ break;
+
+ if (arg == 1) {
+ inst->src[arg] = val;
+ progress = true;
}
+ break;
- switch (inst->opcode) {
- case BRW_OPCODE_MOV:
- case SHADER_OPCODE_LOAD_PAYLOAD:
- case FS_OPCODE_PACK:
- inst->src[i] = val;
+ case BRW_OPCODE_SUBB:
+ if (arg == 1) {
+ inst->src[arg] = val;
progress = true;
- break;
+ }
+ break;
+
+ case BRW_OPCODE_MACH:
+ case BRW_OPCODE_MUL:
+ case SHADER_OPCODE_MULH:
+ case BRW_OPCODE_ADD:
+ case BRW_OPCODE_XOR:
+ case BRW_OPCODE_ADDC:
+ if (arg == 1) {
+ inst->src[arg] = val;
+ progress = true;
+ } else if (arg == 0 && inst->src[1].file != IMM) {
+ /* Don't copy propagate the constant in situations like
+ *
+ * mov(8) g8<1>D 0x7fffffffD
+ * mul(8) g16<1>D g8<8,8,1>D g15<16,8,2>W
+ *
+ * On platforms that only have a 32x16 multiplier, this will
+ * result in lowering the multiply to
+ *
+ * mul(8) g15<1>D g14<8,8,1>D 0xffffUW
+ * mul(8) g16<1>D g14<8,8,1>D 0x7fffUW
+ * add(8) g15.1<2>UW g15.1<16,8,2>UW g16<16,8,2>UW
+ *
+ * On Gfx8 and Gfx9, which have the full 32x32 multiplier, it
+ * results in
+ *
+ * mul(8) g16<1>D g15<16,8,2>W 0x7fffffffD
+ *
+ * Volume 2a of the Skylake PRM says:
+ *
+ * When multiplying a DW and any lower precision integer, the
+ * DW operand must on src0.
+ */
+ if (inst->opcode == BRW_OPCODE_MUL &&
+ type_sz(inst->src[1].type) < 4 &&
+ type_sz(val.type) == 4)
+ break;
- case SHADER_OPCODE_POW:
- /* Allow constant propagation into src1 (except on Gen 6 which
- * doesn't support scalar source math), and let constant combining
- * promote the constant on Gen < 8.
+ /* Fit this constant in by commuting the operands.
+ * Exception: we can't do this for 32-bit integer MUL/MACH
+ * because it's asymmetric.
+ *
+ * The BSpec says for Broadwell that
+ *
+ * "When multiplying DW x DW, the dst cannot be accumulator."
+ *
+ * Integer MUL with a non-accumulator destination will be lowered
+ * by lower_integer_multiplication(), so don't restrict it.
*/
- if (devinfo->ver == 6)
+ if (((inst->opcode == BRW_OPCODE_MUL &&
+ inst->dst.is_accumulator()) ||
+ inst->opcode == BRW_OPCODE_MACH) &&
+ (inst->src[1].type == BRW_REGISTER_TYPE_D ||
+ inst->src[1].type == BRW_REGISTER_TYPE_UD))
break;
+ inst->src[0] = inst->src[1];
+ inst->src[1] = val;
+ progress = true;
+ }
+ break;
- if (i == 1) {
- inst->src[i] = val;
- progress = true;
- }
+ case BRW_OPCODE_ADD3:
+ /* add3 can have a single imm16 source. Proceed if the source type is
+ * already W or UW or the value can be coerced to one of those types.
+ */
+ if (val.type == BRW_REGISTER_TYPE_W || val.type == BRW_REGISTER_TYPE_UW)
+ ; /* Nothing to do. */
+ else if (val.ud <= 0xffff)
+ val = brw_imm_uw(val.ud);
+ else if (val.d >= -0x8000 && val.d <= 0x7fff)
+ val = brw_imm_w(val.d);
+ else
break;
- case BRW_OPCODE_SUBB:
- if (i == 1) {
- inst->src[i] = val;
- progress = true;
- }
- break;
+ if (arg == 2) {
+ inst->src[arg] = val;
+ progress = true;
+ } else if (inst->src[2].file != IMM) {
+ inst->src[arg] = inst->src[2];
+ inst->src[2] = val;
+ progress = true;
+ }
- case BRW_OPCODE_MACH:
- case BRW_OPCODE_MUL:
- case SHADER_OPCODE_MULH:
- case BRW_OPCODE_ADD:
- case BRW_OPCODE_XOR:
- case BRW_OPCODE_ADDC:
- if (i == 1) {
- inst->src[i] = val;
- progress = true;
- } else if (i == 0 && inst->src[1].file != IMM) {
- /* Don't copy propagate the constant in situations like
- *
- * mov(8) g8<1>D 0x7fffffffD
- * mul(8) g16<1>D g8<8,8,1>D g15<16,8,2>W
- *
- * On platforms that only have a 32x16 multiplier, this will
- * result in lowering the multiply to
- *
- * mul(8) g15<1>D g14<8,8,1>D 0xffffUW
- * mul(8) g16<1>D g14<8,8,1>D 0x7fffUW
- * add(8) g15.1<2>UW g15.1<16,8,2>UW g16<16,8,2>UW
- *
- * On Gfx8 and Gfx9, which have the full 32x32 multiplier, it
- * results in
- *
- * mul(8) g16<1>D g15<16,8,2>W 0x7fffffffD
- *
- * Volume 2a of the Skylake PRM says:
- *
- * When multiplying a DW and any lower precision integer, the
- * DW operand must on src0.
- */
- if (inst->opcode == BRW_OPCODE_MUL &&
- type_sz(inst->src[1].type) < 4 &&
- type_sz(val.type) == 4)
- break;
-
- /* Fit this constant in by commuting the operands.
- * Exception: we can't do this for 32-bit integer MUL/MACH
- * because it's asymmetric.
- *
- * The BSpec says for Broadwell that
- *
- * "When multiplying DW x DW, the dst cannot be accumulator."
- *
- * Integer MUL with a non-accumulator destination will be lowered
- * by lower_integer_multiplication(), so don't restrict it.
+ break;
+
+ case BRW_OPCODE_CMP:
+ case BRW_OPCODE_IF:
+ if (arg == 1) {
+ inst->src[arg] = val;
+ progress = true;
+ } else if (arg == 0 && inst->src[1].file != IMM) {
+ enum brw_conditional_mod new_cmod;
+
+ new_cmod = brw_swap_cmod(inst->conditional_mod);
+ if (new_cmod != BRW_CONDITIONAL_NONE) {
+ /* Fit this constant in by swapping the operands and
+ * flipping the test
*/
- if (((inst->opcode == BRW_OPCODE_MUL &&
- inst->dst.is_accumulator()) ||
- inst->opcode == BRW_OPCODE_MACH) &&
- (inst->src[1].type == BRW_REGISTER_TYPE_D ||
- inst->src[1].type == BRW_REGISTER_TYPE_UD))
- break;
inst->src[0] = inst->src[1];
inst->src[1] = val;
+ inst->conditional_mod = new_cmod;
progress = true;
}
- break;
-
- case BRW_OPCODE_ADD3:
- /* add3 can have a single imm16 source. Proceed if the source type is
- * already W or UW or the value can be coerced to one of those types.
- */
- if (val.type == BRW_REGISTER_TYPE_W || val.type == BRW_REGISTER_TYPE_UW)
- ; /* Nothing to do. */
- else if (val.ud <= 0xffff)
- val = brw_imm_uw(val.ud);
- else if (val.d >= -0x8000 && val.d <= 0x7fff)
- val = brw_imm_w(val.d);
- else
- break;
-
- if (i == 2) {
- inst->src[i] = val;
- progress = true;
- } else if (inst->src[2].file != IMM) {
- inst->src[i] = inst->src[2];
- inst->src[2] = val;
- progress = true;
- }
-
- break;
-
- case BRW_OPCODE_CMP:
- case BRW_OPCODE_IF:
- if (i == 1) {
- inst->src[i] = val;
- progress = true;
- } else if (i == 0 && inst->src[1].file != IMM) {
- enum brw_conditional_mod new_cmod;
+ }
+ break;
- new_cmod = brw_swap_cmod(inst->conditional_mod);
- if (new_cmod != BRW_CONDITIONAL_NONE) {
- /* Fit this constant in by swapping the operands and
- * flipping the test
- */
- inst->src[0] = inst->src[1];
- inst->src[1] = val;
- inst->conditional_mod = new_cmod;
- progress = true;
- }
- }
- break;
+ case BRW_OPCODE_SEL:
+ if (arg == 1) {
+ inst->src[arg] = val;
+ progress = true;
+ } else if (arg == 0) {
+ if (inst->src[1].file != IMM &&
+ (inst->conditional_mod == BRW_CONDITIONAL_NONE ||
+ /* Only GE and L are commutative. */
+ inst->conditional_mod == BRW_CONDITIONAL_GE ||
+ inst->conditional_mod == BRW_CONDITIONAL_L)) {
+ inst->src[0] = inst->src[1];
+ inst->src[1] = val;
- case BRW_OPCODE_SEL:
- if (i == 1) {
- inst->src[i] = val;
- progress = true;
- } else if (i == 0) {
- if (inst->src[1].file != IMM &&
- (inst->conditional_mod == BRW_CONDITIONAL_NONE ||
- /* Only GE and L are commutative. */
- inst->conditional_mod == BRW_CONDITIONAL_GE ||
- inst->conditional_mod == BRW_CONDITIONAL_L)) {
- inst->src[0] = inst->src[1];
- inst->src[1] = val;
-
- /* If this was predicated, flipping operands means
- * we also need to flip the predicate.
- */
- if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
- inst->predicate_inverse =
- !inst->predicate_inverse;
- }
- } else {
- inst->src[0] = val;
+ /* If this was predicated, flipping operands means
+ * we also need to flip the predicate.
+ */
+ if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
+ inst->predicate_inverse =
+ !inst->predicate_inverse;
}
-
- progress = true;
+ } else {
+ inst->src[0] = val;
}
- break;
-
- case FS_OPCODE_FB_WRITE_LOGICAL:
- /* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are
- * bit-cast using a strided region so they cannot be immediates.
- */
- if (i != FB_WRITE_LOGICAL_SRC_SRC_STENCIL &&
- i != FB_WRITE_LOGICAL_SRC_OMASK) {
- inst->src[i] = val;
- progress = true;
- }
- break;
- case SHADER_OPCODE_INT_QUOTIENT:
- case SHADER_OPCODE_INT_REMAINDER:
- /* Allow constant propagation into either source (except on Gen 6
- * which doesn't support scalar source math). Constant combining
- * promote the src1 constant on Gen < 8, and it will promote the src0
- * constant on all platforms.
- */
- if (devinfo->ver == 6)
- break;
-
- FALLTHROUGH;
- case BRW_OPCODE_AND:
- case BRW_OPCODE_ASR:
- case BRW_OPCODE_BFE:
- case BRW_OPCODE_BFI1:
- case BRW_OPCODE_BFI2:
- case BRW_OPCODE_ROL:
- case BRW_OPCODE_ROR:
- case BRW_OPCODE_SHL:
- case BRW_OPCODE_SHR:
- case BRW_OPCODE_OR:
- case SHADER_OPCODE_TEX_LOGICAL:
- case SHADER_OPCODE_TXD_LOGICAL:
- case SHADER_OPCODE_TXF_LOGICAL:
- case SHADER_OPCODE_TXL_LOGICAL:
- case SHADER_OPCODE_TXS_LOGICAL:
- case FS_OPCODE_TXB_LOGICAL:
- case SHADER_OPCODE_TXF_CMS_LOGICAL:
- case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
- case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
- case SHADER_OPCODE_TXF_UMS_LOGICAL:
- case SHADER_OPCODE_TXF_MCS_LOGICAL:
- case SHADER_OPCODE_LOD_LOGICAL:
- case SHADER_OPCODE_TG4_LOGICAL:
- case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
- case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
- case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
- case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
- case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
- case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
- case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
- case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
- case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
- case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
- case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
- case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
- case SHADER_OPCODE_BROADCAST:
- case BRW_OPCODE_MAD:
- case BRW_OPCODE_LRP:
- case FS_OPCODE_PACK_HALF_2x16_SPLIT:
- case SHADER_OPCODE_SHUFFLE:
- inst->src[i] = val;
progress = true;
- break;
-
- default:
- break;
}
- }
+ break;
- /* ADD3 can only have the immediate as src0. */
- if (progress && inst->opcode == BRW_OPCODE_ADD3) {
- if (inst->src[2].file == IMM) {
- const auto src0 = inst->src[0];
- inst->src[0] = inst->src[2];
- inst->src[2] = src0;
+ case FS_OPCODE_FB_WRITE_LOGICAL:
+ /* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are
+ * bit-cast using a strided region so they cannot be immediates.
+ */
+ if (arg != FB_WRITE_LOGICAL_SRC_SRC_STENCIL &&
+ arg != FB_WRITE_LOGICAL_SRC_OMASK) {
+ inst->src[arg] = val;
+ progress = true;
}
- }
+ break;
+
+ case SHADER_OPCODE_INT_QUOTIENT:
+ case SHADER_OPCODE_INT_REMAINDER:
+ /* Allow constant propagation into either source (except on Gen 6
+ * which doesn't support scalar source math). Constant combining
+ * promote the src1 constant on Gen < 8, and it will promote the src0
+ * constant on all platforms.
+ */
+ if (devinfo->ver == 6)
+ break;
- /* If only one of the sources of a 2-source, commutative instruction (e.g.,
- * AND) is immediate, it must be src1. If both are immediate, opt_algebraic
- * should fold it away.
- */
- if (progress && inst->sources == 2 && inst->is_commutative() &&
- inst->src[0].file == IMM && inst->src[1].file != IMM) {
- const auto src1 = inst->src[1];
- inst->src[1] = inst->src[0];
- inst->src[0] = src1;
+ FALLTHROUGH;
+ case BRW_OPCODE_AND:
+ case BRW_OPCODE_ASR:
+ case BRW_OPCODE_BFE:
+ case BRW_OPCODE_BFI1:
+ case BRW_OPCODE_BFI2:
+ case BRW_OPCODE_ROL:
+ case BRW_OPCODE_ROR:
+ case BRW_OPCODE_SHL:
+ case BRW_OPCODE_SHR:
+ case BRW_OPCODE_OR:
+ case SHADER_OPCODE_TEX_LOGICAL:
+ case SHADER_OPCODE_TXD_LOGICAL:
+ case SHADER_OPCODE_TXF_LOGICAL:
+ case SHADER_OPCODE_TXL_LOGICAL:
+ case SHADER_OPCODE_TXS_LOGICAL:
+ case FS_OPCODE_TXB_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
+ case SHADER_OPCODE_TXF_UMS_LOGICAL:
+ case SHADER_OPCODE_TXF_MCS_LOGICAL:
+ case SHADER_OPCODE_LOD_LOGICAL:
+ case SHADER_OPCODE_TG4_LOGICAL:
+ case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+ case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
+ case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
+ case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+ case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+ case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+ case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+ case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
+ case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+ case SHADER_OPCODE_BROADCAST:
+ case BRW_OPCODE_MAD:
+ case BRW_OPCODE_LRP:
+ case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+ case SHADER_OPCODE_SHUFFLE:
+ inst->src[arg] = val;
+ progress = true;
+ break;
+
+ default:
+ break;
}
return progress;
foreach_inst_in_block(fs_inst, inst, block) {
/* Try propagating into this instruction. */
- for (int i = 0; i < inst->sources; i++) {
+ bool instruction_progress = false;
+ for (int i = inst->sources - 1; i >= 0; i--) {
if (inst->src[i].file != VGRF)
continue;
foreach_in_list(acp_entry, entry, &acp[inst->src[i].nr % ACP_HASH_SIZE]) {
if (entry->src.file == IMM) {
- if (try_constant_propagate(compiler, inst, entry)) {
- progress = true;
+ if (try_constant_propagate(compiler, inst, entry, i)) {
+ instruction_progress = true;
break;
}
} else {
if (try_copy_propagate(compiler, inst, entry, i, alloc)) {
- progress = true;
+ instruction_progress = true;
break;
}
}
}
}
+ if (instruction_progress) {
+ progress = true;
+
+ /* ADD3 can only have the immediate as src0. */
+ if (inst->opcode == BRW_OPCODE_ADD3) {
+ if (inst->src[2].file == IMM) {
+ const auto src0 = inst->src[0];
+ inst->src[0] = inst->src[2];
+ inst->src[2] = src0;
+ }
+ }
+
+ /* If only one of the sources of a 2-source, commutative instruction (e.g.,
+ * AND) is immediate, it must be src1. If both are immediate, opt_algebraic
+ * should fold it away.
+ */
+ if (inst->sources == 2 && inst->is_commutative() &&
+ inst->src[0].file == IMM && inst->src[1].file != IMM) {
+ const auto src1 = inst->src[1];
+ inst->src[1] = inst->src[0];
+ inst->src[0] = src1;
+ }
+ }
+
/* kill the destination from the ACP */
if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) {