intel/fs: Don't loop in try_constant_propagate
authorIan Romanick <ian.d.romanick@intel.com>
Wed, 15 Mar 2023 19:01:35 +0000 (12:01 -0700)
committerMarge Bot <emma+marge@anholt.net>
Thu, 14 Sep 2023 22:31:23 +0000 (22:31 +0000)
The caller already loops over the sources. This means that the caller
must loop over the sources in reverse because constant propagation
prefers to propagate into the last sources first.

The shader-db and fossil-db changes (below) are all due to SEL
instructions. Changing the order sources are visited changes whether a
SEL with two immediate sources is

    (+f0.0) sel     g12    IMM_A    IMM_B

or

    (-f0.0) sel     g12    IMM_B    IMM_A

The ordering of the sources affects the order the constant combining
encounters the values, and the determines which value is "combined"
and which value remains an immediate.

This affects the results by luck. If there are two instructions:

    (+f0.0) sel     g12    IMM_A    IMM_B
    (+f0.0) sel     g13    IMM_A    IMM_C

Picking IMM_A is advantageous over picking IMM_B and IMM_C. Since the
selection algorithm in constant combining is greedy, this case
requires the algorithm see the values in just the right order for the
right thing to happen.

v2: Rebase on many, many changes. Move instruction source fixup
reordering out or try_constant_propagate.

v3: Rebase on !7698.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25091>

src/intel/compiler/brw_fs_copy_propagation.cpp

index 419643a..f39107f 100644 (file)
@@ -818,7 +818,7 @@ try_copy_propagate(const brw_compiler *compiler, fs_inst *inst,
 
 static bool
 try_constant_propagate(const brw_compiler *compiler, fs_inst *inst,
-                       acp_entry *entry)
+                       acp_entry *entry, int arg)
 {
    const struct intel_device_info *devinfo = compiler->devinfo;
    bool progress = false;
@@ -826,321 +826,299 @@ try_constant_propagate(const brw_compiler *compiler, fs_inst *inst,
    if (type_sz(entry->src.type) > 4)
       return false;
 
-   for (int i = inst->sources - 1; i >= 0; i--) {
-      if (inst->src[i].file != VGRF)
-         continue;
+   if (inst->src[arg].file != VGRF)
+      return false;
 
-      assert(entry->dst.file == VGRF);
-      if (inst->src[i].nr != entry->dst.nr)
-         continue;
+   assert(entry->dst.file == VGRF);
+   if (inst->src[arg].nr != entry->dst.nr)
+      return false;
 
-      /* Bail if inst is reading a range that isn't contained in the range
-       * that entry is writing.
-       */
-      if (!region_contained_in(inst->src[i], inst->size_read(i),
-                               entry->dst, entry->size_written))
-         continue;
+   /* Bail if inst is reading a range that isn't contained in the range
+    * that entry is writing.
+    */
+   if (!region_contained_in(inst->src[arg], inst->size_read(arg),
+                            entry->dst, entry->size_written))
+      return false;
 
-      /* If the size of the use type is larger than the size of the entry
-       * type, the entry doesn't contain all of the data that the user is
-       * trying to use.
-       */
-      if (type_sz(inst->src[i].type) > type_sz(entry->dst.type))
-         continue;
+   /* If the size of the use type is larger than the size of the entry
+    * type, the entry doesn't contain all of the data that the user is
+    * trying to use.
+    */
+   if (type_sz(inst->src[arg].type) > type_sz(entry->dst.type))
+      return false;
 
-      fs_reg val = entry->src;
+   fs_reg val = entry->src;
 
-      /* If the size of the use type is smaller than the size of the entry,
-       * clamp the value to the range of the use type.  This enables constant
-       * copy propagation in cases like
-       *
-       *
-       *    mov(8)          g12<1>UD        0x0000000cUD
-       *    ...
-       *    mul(8)          g47<1>D         g86<8,8,1>D     g12<16,8,2>W
+   /* If the size of the use type is smaller than the size of the entry,
+    * clamp the value to the range of the use type.  This enables constant
+    * copy propagation in cases like
+    *
+    *
+    *    mov(8)          g12<1>UD        0x0000000cUD
+    *    ...
+    *    mul(8)          g47<1>D         g86<8,8,1>D     g12<16,8,2>W
+    */
+   if (type_sz(inst->src[arg].type) < type_sz(entry->dst.type)) {
+      if (type_sz(inst->src[arg].type) != 2 || type_sz(entry->dst.type) != 4)
+         return false;
+
+      assert(inst->src[arg].subnr == 0 || inst->src[arg].subnr == 2);
+
+      /* When subnr is 0, we want the lower 16-bits, and when it's 2, we
+       * want the upper 16-bits. No other values of subnr are valid for a
+       * UD source.
        */
-      if (type_sz(inst->src[i].type) < type_sz(entry->dst.type)) {
-         if (type_sz(inst->src[i].type) != 2 || type_sz(entry->dst.type) != 4)
-            continue;
+      const uint16_t v = inst->src[arg].subnr == 2 ? val.ud >> 16 : val.ud;
 
-         assert(inst->src[i].subnr == 0 || inst->src[i].subnr == 2);
+      val.ud = v | (uint32_t(v) << 16);
+   }
 
-         /* When subnr is 0, we want the lower 16-bits, and when it's 2, we
-          * want the upper 16-bits. No other values of subnr are valid for a
-          * UD source.
-          */
-         const uint16_t v = inst->src[i].subnr == 2 ? val.ud >> 16 : val.ud;
+   val.type = inst->src[arg].type;
 
-         val.ud = v | (uint32_t(v) << 16);
+   if (inst->src[arg].abs) {
+      if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
+          !brw_abs_immediate(val.type, &val.as_brw_reg())) {
+         return false;
       }
+   }
 
-      val.type = inst->src[i].type;
-
-      if (inst->src[i].abs) {
-         if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
-             !brw_abs_immediate(val.type, &val.as_brw_reg())) {
-            continue;
-         }
+   if (inst->src[arg].negate) {
+      if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
+          !brw_negate_immediate(val.type, &val.as_brw_reg())) {
+         return false;
       }
+   }
 
-      if (inst->src[i].negate) {
-         if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
-             !brw_negate_immediate(val.type, &val.as_brw_reg())) {
-            continue;
-         }
+   switch (inst->opcode) {
+   case BRW_OPCODE_MOV:
+   case SHADER_OPCODE_LOAD_PAYLOAD:
+   case FS_OPCODE_PACK:
+      inst->src[arg] = val;
+      progress = true;
+      break;
+
+   case SHADER_OPCODE_POW:
+      /* Allow constant propagation into src1 (except on Gen 6 which
+       * doesn't support scalar source math), and let constant combining
+       * promote the constant on Gen < 8.
+       */
+      if (devinfo->ver == 6)
+         break;
+
+      if (arg == 1) {
+         inst->src[arg] = val;
+         progress = true;
       }
+      break;
 
-      switch (inst->opcode) {
-      case BRW_OPCODE_MOV:
-      case SHADER_OPCODE_LOAD_PAYLOAD:
-      case FS_OPCODE_PACK:
-         inst->src[i] = val;
+   case BRW_OPCODE_SUBB:
+      if (arg == 1) {
+         inst->src[arg] = val;
          progress = true;
-         break;
+      }
+      break;
+
+   case BRW_OPCODE_MACH:
+   case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_XOR:
+   case BRW_OPCODE_ADDC:
+      if (arg == 1) {
+         inst->src[arg] = val;
+         progress = true;
+      } else if (arg == 0 && inst->src[1].file != IMM) {
+         /* Don't copy propagate the constant in situations like
+          *
+          *    mov(8)          g8<1>D          0x7fffffffD
+          *    mul(8)          g16<1>D         g8<8,8,1>D      g15<16,8,2>W
+          *
+          * On platforms that only have a 32x16 multiplier, this will
+          * result in lowering the multiply to
+          *
+          *    mul(8)          g15<1>D         g14<8,8,1>D     0xffffUW
+          *    mul(8)          g16<1>D         g14<8,8,1>D     0x7fffUW
+          *    add(8)          g15.1<2>UW      g15.1<16,8,2>UW g16<16,8,2>UW
+          *
+          * On Gfx8 and Gfx9, which have the full 32x32 multiplier, it
+          * results in
+          *
+          *    mul(8)          g16<1>D         g15<16,8,2>W    0x7fffffffD
+          *
+          * Volume 2a of the Skylake PRM says:
+          *
+          *    When multiplying a DW and any lower precision integer, the
+          *    DW operand must on src0.
+          */
+         if (inst->opcode == BRW_OPCODE_MUL &&
+             type_sz(inst->src[1].type) < 4 &&
+             type_sz(val.type) == 4)
+            break;
 
-      case SHADER_OPCODE_POW:
-         /* Allow constant propagation into src1 (except on Gen 6 which
-          * doesn't support scalar source math), and let constant combining
-          * promote the constant on Gen < 8.
+         /* Fit this constant in by commuting the operands.
+          * Exception: we can't do this for 32-bit integer MUL/MACH
+          * because it's asymmetric.
+          *
+          * The BSpec says for Broadwell that
+          *
+          *    "When multiplying DW x DW, the dst cannot be accumulator."
+          *
+          * Integer MUL with a non-accumulator destination will be lowered
+          * by lower_integer_multiplication(), so don't restrict it.
           */
-         if (devinfo->ver == 6)
+         if (((inst->opcode == BRW_OPCODE_MUL &&
+               inst->dst.is_accumulator()) ||
+              inst->opcode == BRW_OPCODE_MACH) &&
+             (inst->src[1].type == BRW_REGISTER_TYPE_D ||
+              inst->src[1].type == BRW_REGISTER_TYPE_UD))
             break;
+         inst->src[0] = inst->src[1];
+         inst->src[1] = val;
+         progress = true;
+      }
+      break;
 
-         if (i == 1) {
-            inst->src[i] = val;
-            progress = true;
-         }
+   case BRW_OPCODE_ADD3:
+      /* add3 can have a single imm16 source. Proceed if the source type is
+       * already W or UW or the value can be coerced to one of those types.
+       */
+      if (val.type == BRW_REGISTER_TYPE_W || val.type == BRW_REGISTER_TYPE_UW)
+         ; /* Nothing to do. */
+      else if (val.ud <= 0xffff)
+         val = brw_imm_uw(val.ud);
+      else if (val.d >= -0x8000 && val.d <= 0x7fff)
+         val = brw_imm_w(val.d);
+      else
          break;
 
-      case BRW_OPCODE_SUBB:
-         if (i == 1) {
-            inst->src[i] = val;
-            progress = true;
-         }
-         break;
+      if (arg == 2) {
+         inst->src[arg] = val;
+         progress = true;
+      } else if (inst->src[2].file != IMM) {
+         inst->src[arg] = inst->src[2];
+         inst->src[2] = val;
+         progress = true;
+      }
 
-      case BRW_OPCODE_MACH:
-      case BRW_OPCODE_MUL:
-      case SHADER_OPCODE_MULH:
-      case BRW_OPCODE_ADD:
-      case BRW_OPCODE_XOR:
-      case BRW_OPCODE_ADDC:
-         if (i == 1) {
-            inst->src[i] = val;
-            progress = true;
-         } else if (i == 0 && inst->src[1].file != IMM) {
-            /* Don't copy propagate the constant in situations like
-             *
-             *    mov(8)          g8<1>D          0x7fffffffD
-             *    mul(8)          g16<1>D         g8<8,8,1>D      g15<16,8,2>W
-             *
-             * On platforms that only have a 32x16 multiplier, this will
-             * result in lowering the multiply to
-             *
-             *    mul(8)          g15<1>D         g14<8,8,1>D     0xffffUW
-             *    mul(8)          g16<1>D         g14<8,8,1>D     0x7fffUW
-             *    add(8)          g15.1<2>UW      g15.1<16,8,2>UW g16<16,8,2>UW
-             *
-             * On Gfx8 and Gfx9, which have the full 32x32 multiplier, it
-             * results in
-             *
-             *    mul(8)          g16<1>D         g15<16,8,2>W    0x7fffffffD
-             *
-             * Volume 2a of the Skylake PRM says:
-             *
-             *    When multiplying a DW and any lower precision integer, the
-             *    DW operand must on src0.
-             */
-            if (inst->opcode == BRW_OPCODE_MUL &&
-                type_sz(inst->src[1].type) < 4 &&
-                type_sz(val.type) == 4)
-               break;
-
-            /* Fit this constant in by commuting the operands.
-             * Exception: we can't do this for 32-bit integer MUL/MACH
-             * because it's asymmetric.
-             *
-             * The BSpec says for Broadwell that
-             *
-             *    "When multiplying DW x DW, the dst cannot be accumulator."
-             *
-             * Integer MUL with a non-accumulator destination will be lowered
-             * by lower_integer_multiplication(), so don't restrict it.
+      break;
+
+   case BRW_OPCODE_CMP:
+   case BRW_OPCODE_IF:
+      if (arg == 1) {
+         inst->src[arg] = val;
+         progress = true;
+      } else if (arg == 0 && inst->src[1].file != IMM) {
+         enum brw_conditional_mod new_cmod;
+
+         new_cmod = brw_swap_cmod(inst->conditional_mod);
+         if (new_cmod != BRW_CONDITIONAL_NONE) {
+            /* Fit this constant in by swapping the operands and
+             * flipping the test
              */
-            if (((inst->opcode == BRW_OPCODE_MUL &&
-                  inst->dst.is_accumulator()) ||
-                 inst->opcode == BRW_OPCODE_MACH) &&
-                (inst->src[1].type == BRW_REGISTER_TYPE_D ||
-                 inst->src[1].type == BRW_REGISTER_TYPE_UD))
-               break;
             inst->src[0] = inst->src[1];
             inst->src[1] = val;
+            inst->conditional_mod = new_cmod;
             progress = true;
          }
-         break;
-
-      case BRW_OPCODE_ADD3:
-         /* add3 can have a single imm16 source. Proceed if the source type is
-          * already W or UW or the value can be coerced to one of those types.
-          */
-         if (val.type == BRW_REGISTER_TYPE_W || val.type == BRW_REGISTER_TYPE_UW)
-            ; /* Nothing to do. */
-         else if (val.ud <= 0xffff)
-            val = brw_imm_uw(val.ud);
-         else if (val.d >= -0x8000 && val.d <= 0x7fff)
-            val = brw_imm_w(val.d);
-         else
-            break;
-
-         if (i == 2) {
-            inst->src[i] = val;
-            progress = true;
-         } else if (inst->src[2].file != IMM) {
-            inst->src[i] = inst->src[2];
-            inst->src[2] = val;
-            progress = true;
-         }
-
-         break;
-
-      case BRW_OPCODE_CMP:
-      case BRW_OPCODE_IF:
-         if (i == 1) {
-            inst->src[i] = val;
-            progress = true;
-         } else if (i == 0 && inst->src[1].file != IMM) {
-            enum brw_conditional_mod new_cmod;
+      }
+      break;
 
-            new_cmod = brw_swap_cmod(inst->conditional_mod);
-            if (new_cmod != BRW_CONDITIONAL_NONE) {
-               /* Fit this constant in by swapping the operands and
-                * flipping the test
-                */
-               inst->src[0] = inst->src[1];
-               inst->src[1] = val;
-               inst->conditional_mod = new_cmod;
-               progress = true;
-            }
-         }
-         break;
+   case BRW_OPCODE_SEL:
+      if (arg == 1) {
+         inst->src[arg] = val;
+         progress = true;
+      } else if (arg == 0) {
+         if (inst->src[1].file != IMM &&
+             (inst->conditional_mod == BRW_CONDITIONAL_NONE ||
+              /* Only GE and L are commutative. */
+              inst->conditional_mod == BRW_CONDITIONAL_GE ||
+              inst->conditional_mod == BRW_CONDITIONAL_L)) {
+            inst->src[0] = inst->src[1];
+            inst->src[1] = val;
 
-      case BRW_OPCODE_SEL:
-         if (i == 1) {
-            inst->src[i] = val;
-            progress = true;
-         } else if (i == 0) {
-            if (inst->src[1].file != IMM &&
-                (inst->conditional_mod == BRW_CONDITIONAL_NONE ||
-                 /* Only GE and L are commutative. */
-                 inst->conditional_mod == BRW_CONDITIONAL_GE ||
-                 inst->conditional_mod == BRW_CONDITIONAL_L)) {
-               inst->src[0] = inst->src[1];
-               inst->src[1] = val;
-
-               /* If this was predicated, flipping operands means
-                * we also need to flip the predicate.
-                */
-               if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
-                  inst->predicate_inverse =
-                     !inst->predicate_inverse;
-               }
-            } else {
-               inst->src[0] = val;
+            /* If this was predicated, flipping operands means
+             * we also need to flip the predicate.
+             */
+            if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
+               inst->predicate_inverse =
+                  !inst->predicate_inverse;
             }
-
-            progress = true;
+         } else {
+            inst->src[0] = val;
          }
-         break;
-
-      case FS_OPCODE_FB_WRITE_LOGICAL:
-         /* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are
-          * bit-cast using a strided region so they cannot be immediates.
-          */
-         if (i != FB_WRITE_LOGICAL_SRC_SRC_STENCIL &&
-             i != FB_WRITE_LOGICAL_SRC_OMASK) {
-            inst->src[i] = val;
-            progress = true;
-         }
-         break;
 
-      case SHADER_OPCODE_INT_QUOTIENT:
-      case SHADER_OPCODE_INT_REMAINDER:
-         /* Allow constant propagation into either source (except on Gen 6
-          * which doesn't support scalar source math). Constant combining
-          * promote the src1 constant on Gen < 8, and it will promote the src0
-          * constant on all platforms.
-          */
-         if (devinfo->ver == 6)
-            break;
-
-         FALLTHROUGH;
-      case BRW_OPCODE_AND:
-      case BRW_OPCODE_ASR:
-      case BRW_OPCODE_BFE:
-      case BRW_OPCODE_BFI1:
-      case BRW_OPCODE_BFI2:
-      case BRW_OPCODE_ROL:
-      case BRW_OPCODE_ROR:
-      case BRW_OPCODE_SHL:
-      case BRW_OPCODE_SHR:
-      case BRW_OPCODE_OR:
-      case SHADER_OPCODE_TEX_LOGICAL:
-      case SHADER_OPCODE_TXD_LOGICAL:
-      case SHADER_OPCODE_TXF_LOGICAL:
-      case SHADER_OPCODE_TXL_LOGICAL:
-      case SHADER_OPCODE_TXS_LOGICAL:
-      case FS_OPCODE_TXB_LOGICAL:
-      case SHADER_OPCODE_TXF_CMS_LOGICAL:
-      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
-      case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
-      case SHADER_OPCODE_TXF_UMS_LOGICAL:
-      case SHADER_OPCODE_TXF_MCS_LOGICAL:
-      case SHADER_OPCODE_LOD_LOGICAL:
-      case SHADER_OPCODE_TG4_LOGICAL:
-      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
-      case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
-      case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
-      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
-      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
-      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
-      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
-      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
-      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
-      case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
-      case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
-      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
-      case SHADER_OPCODE_BROADCAST:
-      case BRW_OPCODE_MAD:
-      case BRW_OPCODE_LRP:
-      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
-      case SHADER_OPCODE_SHUFFLE:
-         inst->src[i] = val;
          progress = true;
-         break;
-
-      default:
-         break;
       }
-   }
+      break;
 
-   /* ADD3 can only have the immediate as src0. */
-   if (progress && inst->opcode == BRW_OPCODE_ADD3) {
-      if (inst->src[2].file == IMM) {
-         const auto src0 = inst->src[0];
-         inst->src[0] = inst->src[2];
-         inst->src[2] = src0;
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+      /* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are
+       * bit-cast using a strided region so they cannot be immediates.
+       */
+      if (arg != FB_WRITE_LOGICAL_SRC_SRC_STENCIL &&
+          arg != FB_WRITE_LOGICAL_SRC_OMASK) {
+         inst->src[arg] = val;
+         progress = true;
       }
-   }
+      break;
+
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+      /* Allow constant propagation into either source (except on Gen 6
+       * which doesn't support scalar source math). Constant combining
+       * promote the src1 constant on Gen < 8, and it will promote the src0
+       * constant on all platforms.
+       */
+      if (devinfo->ver == 6)
+         break;
 
-   /* If only one of the sources of a 2-source, commutative instruction (e.g.,
-    * AND) is immediate, it must be src1. If both are immediate, opt_algebraic
-    * should fold it away.
-    */
-   if (progress && inst->sources == 2 && inst->is_commutative() &&
-       inst->src[0].file == IMM && inst->src[1].file != IMM) {
-      const auto src1 = inst->src[1];
-      inst->src[1] = inst->src[0];
-      inst->src[0] = src1;
+      FALLTHROUGH;
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_ASR:
+   case BRW_OPCODE_BFE:
+   case BRW_OPCODE_BFI1:
+   case BRW_OPCODE_BFI2:
+   case BRW_OPCODE_ROL:
+   case BRW_OPCODE_ROR:
+   case BRW_OPCODE_SHL:
+   case BRW_OPCODE_SHR:
+   case BRW_OPCODE_OR:
+   case SHADER_OPCODE_TEX_LOGICAL:
+   case SHADER_OPCODE_TXD_LOGICAL:
+   case SHADER_OPCODE_TXF_LOGICAL:
+   case SHADER_OPCODE_TXL_LOGICAL:
+   case SHADER_OPCODE_TXS_LOGICAL:
+   case FS_OPCODE_TXB_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
+   case SHADER_OPCODE_TXF_UMS_LOGICAL:
+   case SHADER_OPCODE_TXF_MCS_LOGICAL:
+   case SHADER_OPCODE_LOD_LOGICAL:
+   case SHADER_OPCODE_TG4_LOGICAL:
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
+   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
+   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+   case SHADER_OPCODE_BROADCAST:
+   case BRW_OPCODE_MAD:
+   case BRW_OPCODE_LRP:
+   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+   case SHADER_OPCODE_SHUFFLE:
+      inst->src[arg] = val;
+      progress = true;
+      break;
+
+   default:
+      break;
    }
 
    return progress;
@@ -1178,25 +1156,50 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
 
    foreach_inst_in_block(fs_inst, inst, block) {
       /* Try propagating into this instruction. */
-      for (int i = 0; i < inst->sources; i++) {
+      bool instruction_progress = false;
+      for (int i = inst->sources - 1; i >= 0; i--) {
          if (inst->src[i].file != VGRF)
             continue;
 
          foreach_in_list(acp_entry, entry, &acp[inst->src[i].nr % ACP_HASH_SIZE]) {
             if (entry->src.file == IMM) {
-               if (try_constant_propagate(compiler, inst, entry)) {
-                  progress = true;
+               if (try_constant_propagate(compiler, inst, entry, i)) {
+                  instruction_progress = true;
                   break;
                }
             } else {
                if (try_copy_propagate(compiler, inst, entry, i, alloc)) {
-                  progress = true;
+                  instruction_progress = true;
                   break;
                }
             }
          }
       }
 
+      if (instruction_progress) {
+         progress = true;
+
+         /* ADD3 can only have the immediate as src0. */
+         if (inst->opcode == BRW_OPCODE_ADD3) {
+            if (inst->src[2].file == IMM) {
+               const auto src0 = inst->src[0];
+               inst->src[0] = inst->src[2];
+               inst->src[2] = src0;
+            }
+         }
+
+         /* If only one of the sources of a 2-source, commutative instruction (e.g.,
+          * AND) is immediate, it must be src1. If both are immediate, opt_algebraic
+          * should fold it away.
+          */
+         if (inst->sources == 2 && inst->is_commutative() &&
+             inst->src[0].file == IMM && inst->src[1].file != IMM) {
+            const auto src1 = inst->src[1];
+            inst->src[1] = inst->src[0];
+            inst->src[0] = src1;
+         }
+      }
+
       /* kill the destination from the ACP */
       if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
          foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) {