intel/fs: Don't loop in try_constant_propagate

author Ian Romanick <ian.d.romanick@intel.com>

Wed, 15 Mar 2023 19:01:35 +0000 (12:01 -0700)

committer Marge Bot <emma+marge@anholt.net>

Thu, 14 Sep 2023 22:31:23 +0000 (22:31 +0000)
author Ian Romanick <ian.d.romanick@intel.com>
Wed, 15 Mar 2023 19:01:35 +0000 (12:01 -0700)
committer Marge Bot <emma+marge@anholt.net>
Thu, 14 Sep 2023 22:31:23 +0000 (22:31 +0000)
diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp

index 419643a..f39107f 100644 (file)
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -818,7 +818,7 @@ try_copy_propagate(const brw_compiler *compiler, fs_inst *inst,
  
  static bool
  try_constant_propagate(const brw_compiler *compiler, fs_inst *inst,
-                       acp_entry *entry)
+                       acp_entry *entry, int arg)
  {
     const struct intel_device_info *devinfo = compiler->devinfo;
     bool progress = false;
@@ -826,321 +826,299 @@ try_constant_propagate(const brw_compiler *compiler, fs_inst *inst,
     if (type_sz(entry->src.type) > 4)
        return false;
  
-   for (int i = inst->sources - 1; i >= 0; i--) {
-      if (inst->src[i].file != VGRF)
-         continue;
+   if (inst->src[arg].file != VGRF)
+      return false;
  
-      assert(entry->dst.file == VGRF);
-      if (inst->src[i].nr != entry->dst.nr)
-         continue;
+   assert(entry->dst.file == VGRF);
+   if (inst->src[arg].nr != entry->dst.nr)
+      return false;
  
-      /* Bail if inst is reading a range that isn't contained in the range
-       * that entry is writing.
-       */
-      if (!region_contained_in(inst->src[i], inst->size_read(i),
-                               entry->dst, entry->size_written))
-         continue;
+   /* Bail if inst is reading a range that isn't contained in the range
+    * that entry is writing.
+    */
+   if (!region_contained_in(inst->src[arg], inst->size_read(arg),
+                            entry->dst, entry->size_written))
+      return false;
  
-      /* If the size of the use type is larger than the size of the entry
-       * type, the entry doesn't contain all of the data that the user is
-       * trying to use.
-       */
-      if (type_sz(inst->src[i].type) > type_sz(entry->dst.type))
-         continue;
+   /* If the size of the use type is larger than the size of the entry
+    * type, the entry doesn't contain all of the data that the user is
+    * trying to use.
+    */
+   if (type_sz(inst->src[arg].type) > type_sz(entry->dst.type))
+      return false;
  
-      fs_reg val = entry->src;
+   fs_reg val = entry->src;
  
-      /* If the size of the use type is smaller than the size of the entry,
-       * clamp the value to the range of the use type.  This enables constant
-       * copy propagation in cases like
-       *
-       *
-       *    mov(8)          g12<1>UD        0x0000000cUD
-       *    ...
-       *    mul(8)          g47<1>D         g86<8,8,1>D     g12<16,8,2>W
+   /* If the size of the use type is smaller than the size of the entry,
+    * clamp the value to the range of the use type.  This enables constant
+    * copy propagation in cases like
+    *
+    *
+    *    mov(8)          g12<1>UD        0x0000000cUD
+    *    ...
+    *    mul(8)          g47<1>D         g86<8,8,1>D     g12<16,8,2>W
+    */
+   if (type_sz(inst->src[arg].type) < type_sz(entry->dst.type)) {
+      if (type_sz(inst->src[arg].type) != 2 || type_sz(entry->dst.type) != 4)
+         return false;
+
+      assert(inst->src[arg].subnr == 0 || inst->src[arg].subnr == 2);
+
+      /* When subnr is 0, we want the lower 16-bits, and when it's 2, we
+       * want the upper 16-bits. No other values of subnr are valid for a
+       * UD source.
         */
-      if (type_sz(inst->src[i].type) < type_sz(entry->dst.type)) {
-         if (type_sz(inst->src[i].type) != 2 || type_sz(entry->dst.type) != 4)
-            continue;
+      const uint16_t v = inst->src[arg].subnr == 2 ? val.ud >> 16 : val.ud;
  
-         assert(inst->src[i].subnr == 0 || inst->src[i].subnr == 2);
+      val.ud = v | (uint32_t(v) << 16);
+   }
  
-         /* When subnr is 0, we want the lower 16-bits, and when it's 2, we
-          * want the upper 16-bits. No other values of subnr are valid for a
-          * UD source.
-          */
-         const uint16_t v = inst->src[i].subnr == 2 ? val.ud >> 16 : val.ud;
+   val.type = inst->src[arg].type;
  
-         val.ud = v | (uint32_t(v) << 16);
+   if (inst->src[arg].abs) {
+      if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
+          !brw_abs_immediate(val.type, &val.as_brw_reg())) {
+         return false;
        }
+   }
  
-      val.type = inst->src[i].type;
-
-      if (inst->src[i].abs) {
-         if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
-             !brw_abs_immediate(val.type, &val.as_brw_reg())) {
-            continue;
-         }
+   if (inst->src[arg].negate) {
+      if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
+          !brw_negate_immediate(val.type, &val.as_brw_reg())) {
+         return false;
        }
+   }
  
-      if (inst->src[i].negate) {
-         if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
-             !brw_negate_immediate(val.type, &val.as_brw_reg())) {
-            continue;
-         }
+   switch (inst->opcode) {
+   case BRW_OPCODE_MOV:
+   case SHADER_OPCODE_LOAD_PAYLOAD:
+   case FS_OPCODE_PACK:
+      inst->src[arg] = val;
+      progress = true;
+      break;
+
+   case SHADER_OPCODE_POW:
+      /* Allow constant propagation into src1 (except on Gen 6 which
+       * doesn't support scalar source math), and let constant combining
+       * promote the constant on Gen < 8.
+       */
+      if (devinfo->ver == 6)
+         break;
+
+      if (arg == 1) {
+         inst->src[arg] = val;
+         progress = true;
        }
+      break;
  
-      switch (inst->opcode) {
-      case BRW_OPCODE_MOV:
-      case SHADER_OPCODE_LOAD_PAYLOAD:
-      case FS_OPCODE_PACK:
-         inst->src[i] = val;
+   case BRW_OPCODE_SUBB:
+      if (arg == 1) {
+         inst->src[arg] = val;
           progress = true;
-         break;
+      }
+      break;
+
+   case BRW_OPCODE_MACH:
+   case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_XOR:
+   case BRW_OPCODE_ADDC:
+      if (arg == 1) {
+         inst->src[arg] = val;
+         progress = true;
+      } else if (arg == 0 && inst->src[1].file != IMM) {
+         /* Don't copy propagate the constant in situations like
+          *
+          *    mov(8)          g8<1>D          0x7fffffffD
+          *    mul(8)          g16<1>D         g8<8,8,1>D      g15<16,8,2>W
+          *
+          * On platforms that only have a 32x16 multiplier, this will
+          * result in lowering the multiply to
+          *
+          *    mul(8)          g15<1>D         g14<8,8,1>D     0xffffUW
+          *    mul(8)          g16<1>D         g14<8,8,1>D     0x7fffUW
+          *    add(8)          g15.1<2>UW      g15.1<16,8,2>UW g16<16,8,2>UW
+          *
+          * On Gfx8 and Gfx9, which have the full 32x32 multiplier, it
+          * results in
+          *
+          *    mul(8)          g16<1>D         g15<16,8,2>W    0x7fffffffD
+          *
+          * Volume 2a of the Skylake PRM says:
+          *
+          *    When multiplying a DW and any lower precision integer, the
+          *    DW operand must on src0.
+          */
+         if (inst->opcode == BRW_OPCODE_MUL &&
+             type_sz(inst->src[1].type) < 4 &&
+             type_sz(val.type) == 4)
+            break;
  
-      case SHADER_OPCODE_POW:
-         /* Allow constant propagation into src1 (except on Gen 6 which
-          * doesn't support scalar source math), and let constant combining
-          * promote the constant on Gen < 8.
+         /* Fit this constant in by commuting the operands.
+          * Exception: we can't do this for 32-bit integer MUL/MACH
+          * because it's asymmetric.
+          *
+          * The BSpec says for Broadwell that
+          *
+          *    "When multiplying DW x DW, the dst cannot be accumulator."
+          *
+          * Integer MUL with a non-accumulator destination will be lowered
+          * by lower_integer_multiplication(), so don't restrict it.
            */
-         if (devinfo->ver == 6)
+         if (((inst->opcode == BRW_OPCODE_MUL &&
+               inst->dst.is_accumulator()) ||
+              inst->opcode == BRW_OPCODE_MACH) &&
+             (inst->src[1].type == BRW_REGISTER_TYPE_D ||
+              inst->src[1].type == BRW_REGISTER_TYPE_UD))
              break;
+         inst->src[0] = inst->src[1];
+         inst->src[1] = val;
+         progress = true;
+      }
+      break;
  
-         if (i == 1) {
-            inst->src[i] = val;
-            progress = true;
-         }
+   case BRW_OPCODE_ADD3:
+      /* add3 can have a single imm16 source. Proceed if the source type is
+       * already W or UW or the value can be coerced to one of those types.
+       */
+      if (val.type == BRW_REGISTER_TYPE_W || val.type == BRW_REGISTER_TYPE_UW)
+         ; /* Nothing to do. */
+      else if (val.ud <= 0xffff)
+         val = brw_imm_uw(val.ud);
+      else if (val.d >= -0x8000 && val.d <= 0x7fff)
+         val = brw_imm_w(val.d);
+      else
           break;
  
-      case BRW_OPCODE_SUBB:
-         if (i == 1) {
-            inst->src[i] = val;
-            progress = true;
-         }
-         break;
+      if (arg == 2) {
+         inst->src[arg] = val;
+         progress = true;
+      } else if (inst->src[2].file != IMM) {
+         inst->src[arg] = inst->src[2];
+         inst->src[2] = val;
+         progress = true;
+      }
  
-      case BRW_OPCODE_MACH:
-      case BRW_OPCODE_MUL:
-      case SHADER_OPCODE_MULH:
-      case BRW_OPCODE_ADD:
-      case BRW_OPCODE_XOR:
-      case BRW_OPCODE_ADDC:
-         if (i == 1) {
-            inst->src[i] = val;
-            progress = true;
-         } else if (i == 0 && inst->src[1].file != IMM) {
-            /* Don't copy propagate the constant in situations like
-             *
-             *    mov(8)          g8<1>D          0x7fffffffD
-             *    mul(8)          g16<1>D         g8<8,8,1>D      g15<16,8,2>W
-             *
-             * On platforms that only have a 32x16 multiplier, this will
-             * result in lowering the multiply to
-             *
-             *    mul(8)          g15<1>D         g14<8,8,1>D     0xffffUW
-             *    mul(8)          g16<1>D         g14<8,8,1>D     0x7fffUW
-             *    add(8)          g15.1<2>UW      g15.1<16,8,2>UW g16<16,8,2>UW
-             *
-             * On Gfx8 and Gfx9, which have the full 32x32 multiplier, it
-             * results in
-             *
-             *    mul(8)          g16<1>D         g15<16,8,2>W    0x7fffffffD
-             *
-             * Volume 2a of the Skylake PRM says:
-             *
-             *    When multiplying a DW and any lower precision integer, the
-             *    DW operand must on src0.
-             */
-            if (inst->opcode == BRW_OPCODE_MUL &&
-                type_sz(inst->src[1].type) < 4 &&
-                type_sz(val.type) == 4)
-               break;
-
-            /* Fit this constant in by commuting the operands.
-             * Exception: we can't do this for 32-bit integer MUL/MACH
-             * because it's asymmetric.
-             *
-             * The BSpec says for Broadwell that
-             *
-             *    "When multiplying DW x DW, the dst cannot be accumulator."
-             *
-             * Integer MUL with a non-accumulator destination will be lowered
-             * by lower_integer_multiplication(), so don't restrict it.
+      break;
+
+   case BRW_OPCODE_CMP:
+   case BRW_OPCODE_IF:
+      if (arg == 1) {
+         inst->src[arg] = val;
+         progress = true;
+      } else if (arg == 0 && inst->src[1].file != IMM) {
+         enum brw_conditional_mod new_cmod;
+
+         new_cmod = brw_swap_cmod(inst->conditional_mod);
+         if (new_cmod != BRW_CONDITIONAL_NONE) {
+            /* Fit this constant in by swapping the operands and
+             * flipping the test
               */
-            if (((inst->opcode == BRW_OPCODE_MUL &&
-                  inst->dst.is_accumulator()) ||
-                 inst->opcode == BRW_OPCODE_MACH) &&
-                (inst->src[1].type == BRW_REGISTER_TYPE_D ||
-                 inst->src[1].type == BRW_REGISTER_TYPE_UD))
-               break;
              inst->src[0] = inst->src[1];
              inst->src[1] = val;
+            inst->conditional_mod = new_cmod;
              progress = true;
           }
-         break;
-
-      case BRW_OPCODE_ADD3:
-         /* add3 can have a single imm16 source. Proceed if the source type is
-          * already W or UW or the value can be coerced to one of those types.
-          */
-         if (val.type == BRW_REGISTER_TYPE_W || val.type == BRW_REGISTER_TYPE_UW)
-            ; /* Nothing to do. */
-         else if (val.ud <= 0xffff)
-            val = brw_imm_uw(val.ud);
-         else if (val.d >= -0x8000 && val.d <= 0x7fff)
-            val = brw_imm_w(val.d);
-         else
-            break;
-
-         if (i == 2) {
-            inst->src[i] = val;
-            progress = true;
-         } else if (inst->src[2].file != IMM) {
-            inst->src[i] = inst->src[2];
-            inst->src[2] = val;
-            progress = true;
-         }
-
-         break;
-
-      case BRW_OPCODE_CMP:
-      case BRW_OPCODE_IF:
-         if (i == 1) {
-            inst->src[i] = val;
-            progress = true;
-         } else if (i == 0 && inst->src[1].file != IMM) {
-            enum brw_conditional_mod new_cmod;
+      }
+      break;
  
-            new_cmod = brw_swap_cmod(inst->conditional_mod);
-            if (new_cmod != BRW_CONDITIONAL_NONE) {
-               /* Fit this constant in by swapping the operands and
-                * flipping the test
-                */
-               inst->src[0] = inst->src[1];
-               inst->src[1] = val;
-               inst->conditional_mod = new_cmod;
-               progress = true;
-            }
-         }
-         break;
+   case BRW_OPCODE_SEL:
+      if (arg == 1) {
+         inst->src[arg] = val;
+         progress = true;
+      } else if (arg == 0) {
+         if (inst->src[1].file != IMM &&
+             (inst->conditional_mod == BRW_CONDITIONAL_NONE ||
+              /* Only GE and L are commutative. */
+              inst->conditional_mod == BRW_CONDITIONAL_GE ||
+              inst->conditional_mod == BRW_CONDITIONAL_L)) {
+            inst->src[0] = inst->src[1];
+            inst->src[1] = val;
  
-      case BRW_OPCODE_SEL:
-         if (i == 1) {
-            inst->src[i] = val;
-            progress = true;
-         } else if (i == 0) {
-            if (inst->src[1].file != IMM &&
-                (inst->conditional_mod == BRW_CONDITIONAL_NONE ||
-                 /* Only GE and L are commutative. */
-                 inst->conditional_mod == BRW_CONDITIONAL_GE ||
-                 inst->conditional_mod == BRW_CONDITIONAL_L)) {
-               inst->src[0] = inst->src[1];
-               inst->src[1] = val;
-
-               /* If this was predicated, flipping operands means
-                * we also need to flip the predicate.
-                */
-               if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
-                  inst->predicate_inverse =
-                     !inst->predicate_inverse;
-               }
-            } else {
-               inst->src[0] = val;
+            /* If this was predicated, flipping operands means
+             * we also need to flip the predicate.
+             */
+            if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
+               inst->predicate_inverse =
+                  !inst->predicate_inverse;
              }
-
-            progress = true;
+         } else {
+            inst->src[0] = val;
           }
-         break;
-
-      case FS_OPCODE_FB_WRITE_LOGICAL:
-         /* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are
-          * bit-cast using a strided region so they cannot be immediates.
-          */
-         if (i != FB_WRITE_LOGICAL_SRC_SRC_STENCIL &&
-             i != FB_WRITE_LOGICAL_SRC_OMASK) {
-            inst->src[i] = val;
-            progress = true;
-         }
-         break;
  
-      case SHADER_OPCODE_INT_QUOTIENT:
-      case SHADER_OPCODE_INT_REMAINDER:
-         /* Allow constant propagation into either source (except on Gen 6
-          * which doesn't support scalar source math). Constant combining
-          * promote the src1 constant on Gen < 8, and it will promote the src0
-          * constant on all platforms.
-          */
-         if (devinfo->ver == 6)
-            break;
-
-         FALLTHROUGH;
-      case BRW_OPCODE_AND:
-      case BRW_OPCODE_ASR:
-      case BRW_OPCODE_BFE:
-      case BRW_OPCODE_BFI1:
-      case BRW_OPCODE_BFI2:
-      case BRW_OPCODE_ROL:
-      case BRW_OPCODE_ROR:
-      case BRW_OPCODE_SHL:
-      case BRW_OPCODE_SHR:
-      case BRW_OPCODE_OR:
-      case SHADER_OPCODE_TEX_LOGICAL:
-      case SHADER_OPCODE_TXD_LOGICAL:
-      case SHADER_OPCODE_TXF_LOGICAL:
-      case SHADER_OPCODE_TXL_LOGICAL:
-      case SHADER_OPCODE_TXS_LOGICAL:
-      case FS_OPCODE_TXB_LOGICAL:
-      case SHADER_OPCODE_TXF_CMS_LOGICAL:
-      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
-      case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
-      case SHADER_OPCODE_TXF_UMS_LOGICAL:
-      case SHADER_OPCODE_TXF_MCS_LOGICAL:
-      case SHADER_OPCODE_LOD_LOGICAL:
-      case SHADER_OPCODE_TG4_LOGICAL:
-      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
-      case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
-      case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
-      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
-      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
-      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
-      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
-      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
-      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
-      case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
-      case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
-      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
-      case SHADER_OPCODE_BROADCAST:
-      case BRW_OPCODE_MAD:
-      case BRW_OPCODE_LRP:
-      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
-      case SHADER_OPCODE_SHUFFLE:
-         inst->src[i] = val;
           progress = true;
-         break;
-
-      default:
-         break;
        }
-   }
+      break;
  
-   /* ADD3 can only have the immediate as src0. */
-   if (progress && inst->opcode == BRW_OPCODE_ADD3) {
-      if (inst->src[2].file == IMM) {
-         const auto src0 = inst->src[0];
-         inst->src[0] = inst->src[2];
-         inst->src[2] = src0;
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+      /* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are
+       * bit-cast using a strided region so they cannot be immediates.
+       */
+      if (arg != FB_WRITE_LOGICAL_SRC_SRC_STENCIL &&
+          arg != FB_WRITE_LOGICAL_SRC_OMASK) {
+         inst->src[arg] = val;
+         progress = true;
        }
-   }
+      break;
+
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+      /* Allow constant propagation into either source (except on Gen 6
+       * which doesn't support scalar source math). Constant combining
+       * promote the src1 constant on Gen < 8, and it will promote the src0
+       * constant on all platforms.
+       */
+      if (devinfo->ver == 6)
+         break;
  
-   /* If only one of the sources of a 2-source, commutative instruction (e.g.,
-    * AND) is immediate, it must be src1. If both are immediate, opt_algebraic
-    * should fold it away.
-    */
-   if (progress && inst->sources == 2 && inst->is_commutative() &&
-       inst->src[0].file == IMM && inst->src[1].file != IMM) {
-      const auto src1 = inst->src[1];
-      inst->src[1] = inst->src[0];
-      inst->src[0] = src1;
+      FALLTHROUGH;
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_ASR:
+   case BRW_OPCODE_BFE:
+   case BRW_OPCODE_BFI1:
+   case BRW_OPCODE_BFI2:
+   case BRW_OPCODE_ROL:
+   case BRW_OPCODE_ROR:
+   case BRW_OPCODE_SHL:
+   case BRW_OPCODE_SHR:
+   case BRW_OPCODE_OR:
+   case SHADER_OPCODE_TEX_LOGICAL:
+   case SHADER_OPCODE_TXD_LOGICAL:
+   case SHADER_OPCODE_TXF_LOGICAL:
+   case SHADER_OPCODE_TXL_LOGICAL:
+   case SHADER_OPCODE_TXS_LOGICAL:
+   case FS_OPCODE_TXB_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
+   case SHADER_OPCODE_TXF_UMS_LOGICAL:
+   case SHADER_OPCODE_TXF_MCS_LOGICAL:
+   case SHADER_OPCODE_LOD_LOGICAL:
+   case SHADER_OPCODE_TG4_LOGICAL:
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
+   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
+   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+   case SHADER_OPCODE_BROADCAST:
+   case BRW_OPCODE_MAD:
+   case BRW_OPCODE_LRP:
+   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+   case SHADER_OPCODE_SHUFFLE:
+      inst->src[arg] = val;
+      progress = true;
+      break;
+
+   default:
+      break;
     }
  
     return progress;
@@ -1178,25 +1156,50 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
  
     foreach_inst_in_block(fs_inst, inst, block) {
        /* Try propagating into this instruction. */
-      for (int i = 0; i < inst->sources; i++) {
+      bool instruction_progress = false;
+      for (int i = inst->sources - 1; i >= 0; i--) {
           if (inst->src[i].file != VGRF)
              continue;
  
           foreach_in_list(acp_entry, entry, &acp[inst->src[i].nr % ACP_HASH_SIZE]) {
              if (entry->src.file == IMM) {
-               if (try_constant_propagate(compiler, inst, entry)) {
-                  progress = true;
+               if (try_constant_propagate(compiler, inst, entry, i)) {
+                  instruction_progress = true;
                    break;
                 }
              } else {
                 if (try_copy_propagate(compiler, inst, entry, i, alloc)) {
-                  progress = true;
+                  instruction_progress = true;
                    break;
                 }
              }
           }
        }
  
+      if (instruction_progress) {
+         progress = true;
+
+         /* ADD3 can only have the immediate as src0. */
+         if (inst->opcode == BRW_OPCODE_ADD3) {
+            if (inst->src[2].file == IMM) {
+               const auto src0 = inst->src[0];
+               inst->src[0] = inst->src[2];
+               inst->src[2] = src0;
+            }
+         }
+
+         /* If only one of the sources of a 2-source, commutative instruction (e.g.,
+          * AND) is immediate, it must be src1. If both are immediate, opt_algebraic
+          * should fold it away.
+          */
+         if (inst->sources == 2 && inst->is_commutative() &&
+             inst->src[0].file == IMM && inst->src[1].file != IMM) {
+            const auto src1 = inst->src[1];
+            inst->src[1] = inst->src[0];
+            inst->src[0] = src1;
+         }
+      }
+
        /* kill the destination from the ACP */
        if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
           foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) {
author	Ian Romanick <ian.d.romanick@intel.com>
	Wed, 15 Mar 2023 19:01:35 +0000 (12:01 -0700)
committer	Marge Bot <emma+marge@anholt.net>
	Thu, 14 Sep 2023 22:31:23 +0000 (22:31 +0000)