This patch allows SimplifyDemandedBits to call SimplifyMultipleUseDemandedBits in cases where the ISD::SRL source operand has other uses, enabling us to peek through the shifted value if we don't demand all the bits/elts.
This is another step towards removing SelectionDAG::GetDemandedBits and just using TargetLowering::SimplifyMultipleUseDemandedBits.
There are a few cases where we end up with extra register moves which I think we can accept in exchange for the increased ILP.
Differential Revision: https://reviews.llvm.org/D77804
APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
ST->getMemoryVT().getScalarSizeInBits());
- // See if we can simplify the input to this truncstore with knowledge that
- // only the low bits are being used. For example:
- // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8"
+ // See if we can simplify the operation with SimplifyDemandedBits, which
+ // only works if the value has a single use.
AddToWorklist(Value.getNode());
- if (SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits))
- return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(),
- ST->getMemOperand());
-
- // Otherwise, see if we can simplify the operation with
- // SimplifyDemandedBits, which only works if the value has a single use.
if (SimplifyDemandedBits(Value, TruncDemandedBits)) {
// Re-visit the store if anything changed and the store hasn't been merged
// with another node (N is deleted) SimplifyDemandedBits will add Value's
AddToWorklist(N);
return SDValue(N, 0);
}
+
+ // Otherwise, see if we can simplify the input to this truncstore with
+ // knowledge that only the low bits are being used. For example:
+ // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8"
+ if (SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits))
+ return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(),
+ ST->getMemOperand());
}
// If this is a load followed by a store to the same location, then the store
return getConstant(NewVal, SDLoc(V), V.getValueType());
break;
}
- case ISD::SRL:
- // Only look at single-use SRLs.
- if (!V.getNode()->hasOneUse())
- break;
- if (auto *RHSC = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
- // See if we can recursively simplify the LHS.
- unsigned Amt = RHSC->getZExtValue();
-
- // Watch out for shift count overflow though.
- if (Amt >= DemandedBits.getBitWidth())
- break;
- APInt SrcDemandedBits = DemandedBits << Amt;
- if (SDValue SimplifyLHS = TLI->SimplifyMultipleUseDemandedBits(
- V.getOperand(0), SrcDemandedBits, *this))
- return getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS,
- V.getOperand(1));
- }
- break;
}
return SDValue();
}
Known.One.lshrInPlace(ShAmt);
// High bits known zero.
Known.Zero.setHighBits(ShAmt);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!InDemandedMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, InDemandedMask, DemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedOp0) {
+ SDValue NewOp = TLO.DAG.getNode(ISD::SRL, dl, VT, DemandedOp0, Op1);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
}
break;
}
; CHECK-LABEL: parity_17:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0x1ffff
-; CHECK-NEXT: eor w8, w8, w8, lsr #16
-; CHECK-NEXT: eor w8, w8, w8, lsr #8
+; CHECK-NEXT: eor w9, w8, w8, lsr #16
+; CHECK-NEXT: eor w8, w9, w8, lsr #8
; CHECK-NEXT: eor w8, w8, w8, lsr #4
; CHECK-NEXT: eor w8, w8, w8, lsr #2
; CHECK-NEXT: eor w8, w8, w8, lsr #1
define <1 x i64> @ssra_v1i64(<2 x i32> %0) {
; CHECK-LABEL: ssra_v1i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: bic v0.2s, #64, lsl #24
; CHECK-NEXT: ushr d1, d0, #63
+; CHECK-NEXT: bic v0.2s, #64, lsl #24
; CHECK-NEXT: ssra d1, d0, #62
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: ret
define <2 x i64> @ssra_v2i64(<4 x i32> %0) {
; CHECK-LABEL: ssra_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: bic v0.4s, #64, lsl #24
; CHECK-NEXT: ushr v1.2d, v0.2d, #63
+; CHECK-NEXT: bic v0.4s, #64, lsl #24
; CHECK-NEXT: ssra v1.2d, v0.2d, #62
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ret
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: v_bfi_b32 v1, s4, v1, v2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v3
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v2i16:
; SI-NEXT: v_bfi_b32 v2, s4, v2, v7
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v4
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v5
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v4i16:
; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0
-; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:1
-; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:2
-; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:3
-; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:4
-; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:5
+; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
+; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
+; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
+; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3
+; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4
+; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5
; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:6
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:7
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
-; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v4 offset:2
-; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
-; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v5 offset:3
-; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v2
-; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v3 offset:1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v7, s1
+; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
+; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v5 offset:4
+; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
+; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v6 offset:5
+; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v1
+; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v2 offset:1
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
-; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v8 offset:6
+; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v8 offset:6
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
-; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v0 offset:7
-; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v6 offset:4
-; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v7 offset:5
+; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v0 offset:7
+; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v3 offset:2
+; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v4 offset:3
; ALIGNED-SDAG-NEXT: s_endpgm
;
; ALIGNED-GISEL-LABEL: ds8align1:
; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:2
+; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:4
; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:6
-; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:4
+; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:2
; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
-; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:2
+; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:4
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v2
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:6
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
-; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:4
+; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:2
; ALIGNED-SDAG-NEXT: s_endpgm
;
; ALIGNED-GISEL-LABEL: ds8align2:
; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7)
; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5
-; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
-; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10
-; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
-; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11
+; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8
+; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v10 offset:9
+; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
+; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10
+; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
+; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11
; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v1
; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v2 offset:1
; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v3 offset:2
; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 offset:2
; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:4
-; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:10
-; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:8
+; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:8
+; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:10
; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6
; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:4
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
-; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:10
+; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:8
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
-; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:8
+; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:10
; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1
; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 offset:2
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; SI-NEXT: v_or_b32_e32 v2, v6, v2
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB0_3
; SI-NEXT: s_branch .LBB0_4
; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v4, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT: .LBB0_4: ; %exit
; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v4
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16:
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; SI-NEXT: v_or_b32_e32 v2, v6, v2
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; SI-NEXT: v_or_b32_e32 v3, v6, v3
+; SI-NEXT: v_or_b32_e32 v5, v5, v7
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB1_3
; SI-NEXT: s_branch .LBB1_4
; SI-NEXT: .LBB1_2:
-; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB1_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v4, v0
-; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v0
+; SI-NEXT: v_or_b32_e32 v5, v5, v1
; SI-NEXT: .LBB1_4: ; %exit
-; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
-; SI-NEXT: v_bfe_i32 v1, v5, 0, 16
+; SI-NEXT: v_bfe_i32 v0, v5, 0, 16
+; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v3, v3, 0, 16
; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
-; SI-NEXT: v_bfe_i32 v3, v4, 0, 16
; SI-NEXT: v_mov_b32_e32 v4, 0xffff
; SI-NEXT: v_mov_b32_e32 v5, 0x8000
; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000
; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
-; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
+; SI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_or_b32_e32 v2, v3, v4
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v5
; SI-NEXT: v_or_b32_e32 v2, v6, v2
-; SI-NEXT: v_or_b32_e32 v4, v4, v3
+; SI-NEXT: v_or_b32_e32 v4, v4, v7
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB2_3
; SI-NEXT: s_branch .LBB2_4
; SI-NEXT: .LBB2_2:
-; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB2_3: ; %T
; SI-NEXT: v_or_b32_e32 v0, v4, v0
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: .LBB2_4: ; %exit
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000
; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; SI-NEXT: v_or_b32_e32 v2, v6, v2
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB3_3
; SI-NEXT: s_branch .LBB3_4
; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v4, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT: .LBB3_4: ; %exit
; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v4
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_4xi16:
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; SI-NEXT: v_or_b32_e32 v2, v6, v2
-; SI-NEXT: v_or_b32_e32 v3, v4, v3
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v7, v2
+; SI-NEXT: v_or_b32_e32 v3, v6, v3
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB4_3
; SI-NEXT: s_branch .LBB4_4
; SI-NEXT: .LBB4_2:
; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $vgpr5
-; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB4_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_or_b32_e32 v2, v4, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_or_b32_e32 v3, v3, v1
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT: .LBB4_4: ; %exit
; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
-; SI-NEXT: v_bfe_i32 v1, v5, 0, 16
+; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
-; SI-NEXT: v_bfe_i32 v3, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v3, v5, 0, 16
; SI-NEXT: v_mov_b32_e32 v4, 0xffff
; SI-NEXT: v_mov_b32_e32 v5, 0x8000
; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_4xi16_2:
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v5
; SI-NEXT: v_or_b32_e32 v2, v6, v2
-; SI-NEXT: v_or_b32_e32 v4, v4, v3
+; SI-NEXT: v_or_b32_e32 v4, v4, v7
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v4
-; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB5_3
; SI-NEXT: s_branch .LBB5_4
; SI-NEXT: .LBB5_2:
-; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB5_3: ; %T
; SI-NEXT: v_or_b32_e32 v0, v4, v0
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: .LBB5_4: ; %exit
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000
; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_bfe_u32 v1, v0, 16, 15
-; CI-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; CI-NEXT: flat_store_short v[0:1], v0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: flat_store_short v[0:1], v1
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_or_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5
-; SI-NEXT: v_or_b32_e32 v3, 16, v4
+; SI-NEXT: v_or_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v0, v0, v2, v3
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5
+; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v3
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i16:
; SI-NEXT: v_or_b32_e32 v4, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4
-; SI-NEXT: v_or_b32_e32 v4, 16, v10
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6
-; SI-NEXT: v_alignbit_b32 v2, v2, v5, v4
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v5, 16, v10
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT: v_alignbit_b32 v2, v2, v6, v5
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v4i16:
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
+; GFX7-NEXT: v_bfe_i32 v3, v2, 16, 8
; GFX7-NEXT: v_bfe_i32 v4, v2, 0, 8
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX7-NEXT: v_ashrrev_i32_e32 v5, 24, v2
+; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8
+; GFX7-NEXT: v_bfe_i32 v6, v0, 16, 8
; GFX7-NEXT: v_bfe_i32 v7, v0, 0, 8
-; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v7
-; GFX7-NEXT: v_bfe_i32 v8, v0, 16, 8
-; GFX7-NEXT: v_or_b32_e32 v4, v6, v4
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX7-NEXT: v_ashrrev_i32_e32 v8, 24, v0
+; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX7-NEXT: v_alignbit_b32 v2, 0, v2, 16
+; GFX7-NEXT: v_alignbit_b32 v0, 0, v0, 16
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1
-; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v6, v8, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v3, 0xff00, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2
+; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v6, 0xff00, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0
-; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v6
-; GFX7-NEXT: v_or_b32_e32 v4, v7, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT: v_alignbit_b32 v3, s10, v3, 16
+; GFX7-NEXT: v_alignbit_b32 v6, 0, v6, 16
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1
-; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8
-; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8
-; GFX7-NEXT: v_mad_u32_u24 v1, v6, v7, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v8, v4, v5
-; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v8, 16, v6
-; GFX9-NODL-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v7
+; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v7, v4, v5
+; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 8, v6
+; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v6
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v7
-; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v6
-; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6
+; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
+; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v8
; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v4, v5
-; GFX9-DL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v6
-; GFX9-DL-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v7
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v4, v5
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v6
+; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v7
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6
-; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
+; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
+; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8
; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v2
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-DL-NEXT: v_lshrrev_b16 v8, 8, v2
+; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX10-DL-NEXT: v_lshrrev_b16 v9, 8, v2
; GFX10-DL-NEXT: v_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v1
-; GFX10-DL-NEXT: v_mul_lo_u16 v9, v6, v7
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
+; GFX10-DL-NEXT: v_mul_lo_u16 v5, v7, v8
+; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v9
; GFX10-DL-NEXT: v_lshlrev_b16 v4, 8, v4
-; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v8
-; GFX10-DL-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_lshlrev_b16 v5, 8, v5
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v4
+; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6
+; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4
-; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v5
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5
-; GFX10-DL-NEXT: v_mad_u16 v1, v6, v7, v1
+; GFX10-DL-NEXT: v_mad_u16 v1, v7, v8, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2
; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX10-DL-NEXT: s_endpgm
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_bfe_i32 v3, v2, 20, 4
-; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 4
-; GFX7-NEXT: v_bfe_i32 v5, v2, 4, 4
-; GFX7-NEXT: v_bfe_i32 v6, v2, 0, 4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX7-NEXT: v_bfe_i32 v8, v2, 0, 4
+; GFX7-NEXT: v_bfe_i32 v6, v2, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_i32 v10, v0, 20, 4
-; GFX7-NEXT: v_bfe_i32 v11, v0, 16, 4
-; GFX7-NEXT: v_bfe_i32 v12, v0, 4, 4
-; GFX7-NEXT: v_bfe_i32 v13, v0, 0, 4
-; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT: v_or_b32_e32 v4, v6, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v10
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v11
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v12
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v13
-; GFX7-NEXT: v_bfe_i32 v14, v0, 24, 4
-; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0
-; GFX7-NEXT: v_or_b32_e32 v5, v6, v5
-; GFX7-NEXT: v_or_b32_e32 v6, v11, v10
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v14
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v16
-; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v6
+; GFX7-NEXT: v_bfe_i32 v15, v0, 0, 4
+; GFX7-NEXT: v_bfe_i32 v13, v0, 4, 4
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX7-NEXT: v_bfe_i32 v8, v2, 8, 4
-; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4
+; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v4, v6, v1
-; GFX7-NEXT: v_bfe_i32 v7, v2, 24, 4
+; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
+; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
+; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4
+; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4
; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v2
; GFX7-NEXT: v_bfe_i32 v2, v2, 12, 4
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4
+; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4
+; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4
+; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0
; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v15
-; GFX7-NEXT: v_mad_u32_u24 v1, v16, v11, v1
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v8, v13, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v5, v0
+; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX7-NEXT: v_mad_u32_u24 v0, v15, v10, v0
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX7-NEXT: v_mad_u32_u24 v0, v7, v12, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v9, v14, v0
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_ashrrev_i32_e32 v3, 28, v2
-; GFX7-NEXT: v_bfe_i32 v4, v2, 24, 4
-; GFX7-NEXT: v_bfe_i32 v5, v2, 20, 4
-; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4
-; GFX7-NEXT: v_bfe_i32 v7, v2, 12, 4
-; GFX7-NEXT: v_bfe_i32 v8, v2, 8, 4
-; GFX7-NEXT: v_bfe_i32 v9, v2, 4, 4
-; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX7-NEXT: v_bfe_i32 v7, v2, 0, 4
+; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_ashrrev_i32_e32 v10, 28, v0
-; GFX7-NEXT: v_bfe_i32 v11, v0, 24, 4
-; GFX7-NEXT: v_bfe_i32 v12, v0, 20, 4
-; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4
-; GFX7-NEXT: v_bfe_i32 v14, v0, 12, 4
-; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4
-; GFX7-NEXT: v_bfe_i32 v16, v0, 4, 4
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4
-; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX7-NEXT: v_or_b32_e32 v4, v6, v5
-; GFX7-NEXT: v_or_b32_e32 v5, v8, v7
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v9
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v10
-; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v11
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v12
-; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v13
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v14
-; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v15
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v16
+; GFX7-NEXT: v_bfe_i32 v14, v0, 0, 4
+; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4
+; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4
+; GFX7-NEXT: v_bfe_i32 v6, v2, 8, 4
+; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v2
+; GFX7-NEXT: v_bfe_i32 v9, v2, 12, 4
+; GFX7-NEXT: v_bfe_i32 v2, v2, 4, 4
+; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4
+; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4
+; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4
+; GFX7-NEXT: v_bfe_i32 v13, v0, 8, 4
+; GFX7-NEXT: v_ashrrev_i32_e32 v15, 28, v0
+; GFX7-NEXT: v_bfe_i32 v16, v0, 12, 4
+; GFX7-NEXT: v_bfe_i32 v0, v0, 4, 4
+; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_or_b32_e32 v6, v7, v6
-; GFX7-NEXT: v_or_b32_e32 v7, v9, v8
-; GFX7-NEXT: v_or_b32_e32 v8, v11, v10
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v12
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v8
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v13
-; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v2
-; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX7-NEXT: v_bfe_u32 v9, v2, 8, 8
-; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v8, v13, v1
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2
-; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
-; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v0
-; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT: v_mad_u32_u24 v1, v9, v14, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9
+; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 24, v16
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v4
-; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v5
-; GFX7-NEXT: v_mad_u32_u24 v0, v7, v12, v0
-; GFX7-NEXT: v_bfe_u32 v11, v4, 8, 8
-; GFX7-NEXT: v_bfe_u32 v16, v5, 8, 8
-; GFX7-NEXT: v_mad_u32_u24 v0, v10, v15, v0
-; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8
-; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8
-; GFX7-NEXT: v_mad_u32_u24 v0, v11, v16, v0
-; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8
-; GFX7-NEXT: v_bfe_u32 v6, v6, 8, 8
-; GFX7-NEXT: v_mad_u32_u24 v0, v4, v5, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
+; GFX7-NEXT: v_alignbit_b32 v9, 0, v9, 24
+; GFX7-NEXT: v_alignbit_b32 v16, 0, v16, 24
+; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
+; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
+; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
+; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1
; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v2
; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v9
; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v15
+; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8
+; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7
; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16
; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6
; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v1
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 12, v5
; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v14
; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v17
+; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13
+; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12
; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18
; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11
; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v0
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 12, v10
-; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8
-; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7
-; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13
-; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12
-; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1
-; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-NEXT: v_ashrrev_i16_e32 v0, 12, v0
; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
+; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
+; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1
; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v13
; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12
-; GFX9-NEXT: v_mul_lo_u16_e32 v19, v15, v17
-; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
+; GFX9-NEXT: v_ashrrev_i16_e32 v0, 12, v0
; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2
; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX9-NEXT: v_mul_lo_u16_e32 v13, v16, v18
+; GFX9-NEXT: v_mul_lo_u16_e32 v19, v15, v17
+; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_mul_lo_u16_e32 v7, v8, v10
-; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v14
; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1
-; GFX9-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX9-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u16_e32 v1, v7, v4
-; GFX9-NEXT: v_add_u16_e32 v1, v1, v2
+; GFX9-NEXT: v_add_u16_e32 v2, v7, v4
+; GFX9-NEXT: v_add_u16_e32 v1, v2, v1
; GFX9-NEXT: v_add_u16_e32 v1, v1, v6
; GFX9-NEXT: v_add_u16_e32 v0, v1, v0
; GFX9-NEXT: v_mad_legacy_u16 v0, v16, v18, v0
; GFX9-NEXT: v_add_u16_e32 v0, v0, v5
; GFX9-NEXT: v_mad_legacy_u16 v0, v15, v17, v0
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v9
+; GFX9-NEXT: v_add_u16_e32 v0, v0, v8
; GFX9-NEXT: global_store_byte v3, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v2
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2
; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1
; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v2
; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2
; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v9
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v15
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16
; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v1
; GFX9-DL-NEXT: v_lshlrev_b16_e32 v1, 12, v5
; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v14
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v17
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13
+; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18
; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v0
; GFX9-DL-NEXT: v_lshlrev_b16_e32 v0, 12, v10
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13
-; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v0, 12, v0
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v13
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v19, v15, v17
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v0, 12, v0
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v13, v16, v18
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v19, v15, v17
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v8, v10
-; GFX9-DL-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v14
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v1
-; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v14
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v1
-; GFX9-DL-NEXT: v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0
; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_add_u16_e32 v1, v7, v4
-; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2
+; GFX9-DL-NEXT: v_add_u16_e32 v2, v7, v4
+; GFX9-DL-NEXT: v_add_u16_e32 v1, v2, v1
; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0
; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v16, v18, v0
; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v5
; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v15, v17, v0
-; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v9
+; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8
; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v2
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v2
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v15
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v0, 20, v1
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v2
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v2
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v8, v8, v15
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v0
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v0
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v17
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v9, v16
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 8, v8
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11
-; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
-; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v10, v15
; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
+; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12
-; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v0, v11
+; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2
+; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v0, v11
+; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v10
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8
+; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v5, v12
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v11, v7, v14
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 8, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 8, v6
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 8, v10
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8
-; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2
-; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v5, v12
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v9
-; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v11, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v11
+; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v13
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v1, v3
; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v0
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v0
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v0
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v0
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v0
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v18, 12, v0
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v16
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 20, v1
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v8, v8, v15
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 12, v3
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v17
-; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v0, v9, v0
+; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v9, v9, v16
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 8, v8
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v3
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v11
-; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v6, v6, v13
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v10, v15
-; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
-; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v12
-; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v9
-; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v9, v7, v14
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 8, v6
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 8, v10
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14
+; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12
+; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v11
+; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v6, v6, v13
+; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v1, v1, v0
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 8, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8
-; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v1, v1, v18
-; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v12, v5, v11
+; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v5, v12
+; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v11, v7, v14
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 8, v3
-; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v9, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v10, 16, v6
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v9
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 8, v6
+; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v13
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v2, v1, v2
-; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v9, v2, v9
+; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v9, v2, v10
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v9, v8
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v2
-; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v5, v11, v0
+; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v5, v12, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v14, v0
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_bfe_u32 v8, v2, 20, 4
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 12, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
-; GFX7-NEXT: v_bfe_u32 v5, v2, 12, 4
-; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4
-; GFX7-NEXT: v_and_b32_e32 v7, 15, v2
-; GFX7-NEXT: v_alignbit_b32 v2, v8, v2, 16
-; GFX7-NEXT: v_and_b32_e32 v8, 0xf0000, v9
+; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
+; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
+; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
+; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
+; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
+; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 12, v0
-; GFX7-NEXT: v_and_b32_e32 v14, 15, v0
-; GFX7-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX7-NEXT: v_and_b32_e32 v8, 0xf0000, v9
-; GFX7-NEXT: v_or_b32_e32 v8, v14, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 15, v7
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v8
-; GFX7-NEXT: v_and_b32_e32 v8, 15, v8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v7, v8, v1
-; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4
-; GFX7-NEXT: v_bfe_u32 v15, v0, 20, 4
-; GFX7-NEXT: v_mad_u32_u24 v1, v9, v14, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
-; GFX7-NEXT: v_bfe_u32 v12, v0, 12, 4
-; GFX7-NEXT: v_alignbit_b32 v0, v15, v0, 16
-; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
+; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
+; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
+; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
+; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v16, v15, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 4, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 12, v2
-; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 4
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 28, v2
-; GFX7-NEXT: v_bfe_u32 v7, v2, 16, 4
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v2
+; GFX7-NEXT: v_and_b32_e32 v7, 15, v2
+; GFX7-NEXT: v_bfe_u32 v6, v2, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshrrev_b32_e32 v11, 4, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v13, 28, v0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xf00, v8
-; GFX7-NEXT: v_and_b32_e32 v4, 0xf00, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 15, v2
-; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 4
-; GFX7-NEXT: v_and_b32_e32 v12, 15, v0
-; GFX7-NEXT: v_bfe_u32 v14, v0, 16, 4
-; GFX7-NEXT: v_lshrrev_b32_e32 v15, 12, v0
-; GFX7-NEXT: v_alignbit_b32 v2, v6, v2, 24
-; GFX7-NEXT: v_and_b32_e32 v6, 0xf00, v9
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v0
-; GFX7-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT: v_alignbit_b32 v0, v13, v0, 24
-; GFX7-NEXT: v_and_b32_e32 v8, 0xf00, v11
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT: v_and_b32_e32 v4, 0xf00, v15
-; GFX7-NEXT: v_and_b32_e32 v6, 0xf00, v9
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xf0f, v0
-; GFX7-NEXT: v_or_b32_e32 v8, v10, v8
-; GFX7-NEXT: v_and_b32_e32 v2, 0xf0f, v2
-; GFX7-NEXT: v_or_b32_e32 v4, v14, v4
-; GFX7-NEXT: v_or_b32_e32 v6, v12, v6
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v8
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX7-NEXT: v_or_b32_e32 v4, v6, v5
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT: v_and_b32_e32 v7, 15, v3
-; GFX7-NEXT: v_and_b32_e32 v13, 15, v4
-; GFX7-NEXT: v_bfe_u32 v8, v3, 8, 4
-; GFX7-NEXT: v_bfe_u32 v14, v4, 8, 4
+; GFX7-NEXT: v_and_b32_e32 v14, 15, v0
+; GFX7-NEXT: v_bfe_u32 v8, v2, 12, 4
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v2
+; GFX7-NEXT: v_bfe_u32 v13, v0, 4, 4
+; GFX7-NEXT: v_bfe_u32 v15, v0, 12, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v7, v13, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v3
-; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 4
-; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v4
-; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 4
-; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1
-; GFX7-NEXT: v_and_b32_e32 v9, 15, v2
-; GFX7-NEXT: v_and_b32_e32 v15, 15, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v5, v11, v1
-; GFX7-NEXT: v_bfe_u32 v10, v2, 8, 4
-; GFX7-NEXT: v_bfe_u32 v16, v0, 8, 4
-; GFX7-NEXT: v_mad_u32_u24 v1, v9, v15, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2
-; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 4
-; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v0
-; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 4
-; GFX7-NEXT: v_mad_u32_u24 v1, v10, v16, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
+; GFX7-NEXT: v_bfe_u32 v3, v2, 20, 4
+; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 4
+; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 4
+; GFX7-NEXT: v_bfe_u32 v12, v0, 8, 4
+; GFX7-NEXT: v_alignbit_b32 v2, v9, v2, 24
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v15
+; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
+; GFX7-NEXT: v_alignbit_b32 v8, 0, v8, 24
+; GFX7-NEXT: v_alignbit_b32 v7, 0, v9, 24
+; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
+; GFX7-NEXT: v_bfe_u32 v11, v0, 16, 4
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 28, v0
+; GFX7-NEXT: v_mad_u32_u24 v1, v8, v7, v1
+; GFX7-NEXT: v_bfe_u32 v10, v0, 20, 4
+; GFX7-NEXT: v_alignbit_b32 v0, v16, v0, 24
+; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 8, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 8, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v6, v12, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v15, v9, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3]
; GFX9-NEXT: s_addc_u32 s9, s9, 0
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 28, v1
-; GFX9-NEXT: v_bfe_u32 v9, v1, 24, 4
-; GFX9-NEXT: v_bfe_u32 v10, v1, 20, 4
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 28, v2
-; GFX9-NEXT: v_bfe_u32 v16, v2, 24, 4
-; GFX9-NEXT: v_bfe_u32 v17, v2, 20, 4
; GFX9-NEXT: v_bfe_u32 v0, v1, 4, 4
; GFX9-NEXT: v_and_b32_e32 v5, 15, v1
; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4
; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 28, v1
+; GFX9-NEXT: v_bfe_u32 v9, v1, 24, 4
+; GFX9-NEXT: v_bfe_u32 v10, v1, 20, 4
; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 4
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_bfe_u32 v1, v2, 4, 4
; GFX9-NEXT: v_and_b32_e32 v12, 15, v2
; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4
; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 28, v2
+; GFX9-NEXT: v_bfe_u32 v16, v2, 24, 4
+; GFX9-NEXT: v_bfe_u32 v17, v2, 20, 4
; GFX9-NEXT: v_bfe_u32 v2, v2, 16, 4
+; GFX9-NEXT: v_mul_lo_u16_e32 v18, v11, v2
; GFX9-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_mul_lo_u16_e32 v17, v9, v16
; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_mul_lo_u16_e32 v18, v11, v2
; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v14
; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_e32 v8, v17, v8
; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12
; GFX9-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_e32 v1, v18, v10
+; GFX9-NEXT: v_or_b32_e32 v0, v18, v10
+; GFX9-NEXT: v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v6, v7, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v5, v5, v12
-; GFX9-NEXT: v_or_b32_e32 v7, v12, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-NEXT: v_or_b32_e32 v10, v12, v0
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v10
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u16_e32 v1, v5, v4
-; GFX9-NEXT: v_add_u16_e32 v1, v1, v7
+; GFX9-NEXT: v_add_u16_e32 v4, v5, v4
+; GFX9-NEXT: v_add_u16_e32 v1, v4, v1
; GFX9-NEXT: v_add_u16_e32 v1, v1, v6
; GFX9-NEXT: v_add_u16_e32 v0, v1, v0
; GFX9-NEXT: v_mad_legacy_u16 v0, v11, v2, v0
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v8
-; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0
; GFX9-NEXT: v_add_u16_e32 v0, v0, v8
+; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0
+; GFX9-NEXT: v_add_u16_e32 v0, v0, v7
; GFX9-NEXT: global_store_byte v3, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3]
; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1
-; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 24, 4
-; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 20, 4
-; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 28, v2
-; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 24, 4
-; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 20, 4
; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 4, 4
; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1
; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4
; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1
+; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 24, 4
+; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 20, 4
; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 16, 4
+; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_bfe_u32 v1, v2, 4, 4
; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2
; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4
; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 28, v2
+; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 24, 4
+; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 20, 4
; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 16, 4
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v11, v2
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v17, v9, v16
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v11, v2
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v14
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_or_b32_e32 v8, v17, v8
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_or_b32_e32 v1, v18, v10
+; GFX9-DL-NEXT: v_or_b32_e32 v0, v18, v10
+; GFX9-DL-NEXT: v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v6
-; GFX9-DL-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-DL-NEXT: v_or_b32_e32 v5, v5, v12
-; GFX9-DL-NEXT: v_or_b32_e32 v7, v12, v0
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v8
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-DL-NEXT: v_or_b32_e32 v10, v12, v0
; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1]
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v7
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v10
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_add_u16_e32 v1, v5, v4
-; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v7
+; GFX9-DL-NEXT: v_add_u16_e32 v4, v5, v4
+; GFX9-DL-NEXT: v_add_u16_e32 v1, v4, v1
; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0
; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v11, v2, v0
-; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v10
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v8
-; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0
; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8
+; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0
+; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7
; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 12, 4
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4
-; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 8, 4
-; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2
-; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v10
+; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4
; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4
; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1
-; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 24, 4
+; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 8, 4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1
+; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 24, 4
; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4
; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4
-; GFX10-DL-NEXT: v_bfe_u32 v1, v2, 4, 4
-; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v13
+; GFX10-DL-NEXT: v_bfe_u32 v1, v2, 8, 4
+; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v9
+; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2
+; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 20, 4
+; GFX10-DL-NEXT: v_mul_lo_u16 v1, v7, v1
; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6
-; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v2
-; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 24, 4
-; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 20, 4
-; GFX10-DL-NEXT: v_bfe_u32 v16, v2, 16, 4
-; GFX10-DL-NEXT: v_mul_lo_u16 v2, v8, v14
-; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX10-DL-NEXT: v_or_b32_e32 v6, v7, v6
-; GFX10-DL-NEXT: v_mul_lo_u16 v1, v11, v13
-; GFX10-DL-NEXT: v_mul_lo_u16 v7, v9, v15
-; GFX10-DL-NEXT: v_lshlrev_b16 v2, 8, v2
-; GFX10-DL-NEXT: v_lshlrev_b16 v8, 8, v0
+; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v2
+; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v9
+; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 16, 4
+; GFX10-DL-NEXT: v_bfe_u32 v16, v2, 24, 4
+; GFX10-DL-NEXT: v_or_b32_e32 v6, v1, v6
+; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v15
+; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v14
+; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v0
+; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v13
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v10
-; GFX10-DL-NEXT: v_mul_lo_u16 v10, v12, v16
-; GFX10-DL-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-DL-NEXT: v_or_b32_e32 v7, v7, v2
-; GFX10-DL-NEXT: v_or_b32_sdwa v2, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_or_b32_e32 v5, v5, v8
-; GFX10-DL-NEXT: v_or_b32_e32 v1, v10, v1
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v7
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2
+; GFX10-DL-NEXT: v_mul_lo_u16 v1, v12, v7
+; GFX10-DL-NEXT: v_mul_lo_u16 v11, v10, v16
+; GFX10-DL-NEXT: v_lshlrev_b16 v2, 8, v2
+; GFX10-DL-NEXT: v_lshlrev_b16 v8, 8, v8
+; GFX10-DL-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_or_b32_e32 v5, v5, v9
+; GFX10-DL-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX10-DL-NEXT: v_or_b32_sdwa v2, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v13
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3
-; GFX10-DL-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_add_nc_u16 v5, v3, v2
+; GFX10-DL-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_add_nc_u16 v5, v3, v9
; GFX10-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX10-DL-NEXT: v_add_nc_u16 v0, v5, v6
; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2
-; GFX10-DL-NEXT: v_mad_u16 v0, v12, v16, v0
+; GFX10-DL-NEXT: v_mad_u16 v0, v12, v7, v0
; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v7
-; GFX10-DL-NEXT: v_mad_u16 v0, v9, v15, v0
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v8
+; GFX10-DL-NEXT: v_mad_u16 v0, v10, v16, v0
; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1]
; GFX10-DL-NEXT: s_endpgm
; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i16:
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s0, v0
-; SI-NEXT: s_lshl_b32 s1, s0, 8
-; SI-NEXT: s_or_b32 s0, s1, s0
-; SI-NEXT: s_and_b32 s1, s0, 0xff00
-; SI-NEXT: s_lshr_b32 s4, s0, 8
-; SI-NEXT: s_or_b32 s1, s4, s1
-; SI-NEXT: s_lshl_b32 s4, s1, 16
-; SI-NEXT: s_or_b32 s1, s1, s4
-; SI-NEXT: s_or_b32 s0, s0, s4
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v0
-; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
bb:
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_readfirstlane_b32 s0, v0
-; SI-NEXT: s_lshl_b32 s1, s0, 8
-; SI-NEXT: s_or_b32 s0, s1, s0
-; SI-NEXT: s_and_b32 s1, s0, 0xff00
-; SI-NEXT: s_lshr_b32 s4, s0, 8
-; SI-NEXT: s_or_b32 s1, s4, s1
-; SI-NEXT: s_lshl_b32 s4, s1, 16
-; SI-NEXT: s_or_b32 s1, s1, s4
-; SI-NEXT: s_or_b32 s0, s0, s4
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; GCN-LABEL: v_shl_i128_vk:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_alignbit_b32 v4, v2, v1, 15
+; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], 17
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 15, v1
+; GCN-NEXT: v_or_b32_e32 v2, v2, v4
; GCN-NEXT: v_alignbit_b32 v1, v1, v0, 15
-; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 15
; GCN-NEXT: v_lshlrev_b32_e32 v0, 17, v0
-; GCN-NEXT: v_mov_b32_e32 v2, v4
; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = shl i128 %lhs, 17
ret i128 %shl
; GCN-LABEL: v_ashr_i128_vk:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_ashr_i64 v[4:5], v[2:3], 33
-; GCN-NEXT: v_alignbit_b32 v0, v2, v1, 1
-; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 1
-; GCN-NEXT: v_mov_b32_e32 v2, v4
-; GCN-NEXT: v_mov_b32_e32 v3, v5
+; GCN-NEXT: v_mov_b32_e32 v4, v1
+; GCN-NEXT: v_lshl_b64 v[0:1], v[2:3], 31
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 1, v4
+; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], 33
+; GCN-NEXT: v_or_b32_e32 v0, v4, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = ashr i128 %lhs, 33
ret i128 %shl
; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v2i16:
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v1, s5
; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
-; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x3
-; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x0
-; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x2
+; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
+; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
+; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
-; HAWAII-NEXT: s_and_b32 s3, s0, 0xffff
-; HAWAII-NEXT: v_mov_b32_e32 v1, s1
-; HAWAII-NEXT: v_mov_b32_e32 v2, s0
+; HAWAII-NEXT: v_mov_b32_e32 v1, s0
+; HAWAII-NEXT: v_mov_b32_e32 v2, s1
; HAWAII-NEXT: v_mov_b32_e32 v3, s2
-; HAWAII-NEXT: ds_write_b16 v1, v2 offset:4
+; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4
; HAWAII-NEXT: s_waitcnt vmcnt(0)
-; HAWAII-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; HAWAII-NEXT: v_or_b32_e32 v0, s3, v0
-; HAWAII-NEXT: v_bfe_u32 v0, v0, 16, 7
+; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0
; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6
-; HAWAII-NEXT: ds_write_b32 v1, v3
+; HAWAII-NEXT: ds_write_b32 v1, v2
; HAWAII-NEXT: s_endpgm
;
; FIJI-LABEL: local_store_i55:
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: trunc_v2i64_arg_to_v2i16:
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uaddsat_v2i16:
; GFX6-NEXT: v_max_u32_e32 v0, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v2i16:
;
; BE-LABEL: i56_or:
; BE: @ %bb.0:
-; BE-NEXT: ldr r1, [r0]
-; BE-NEXT: strb r1, [r0, #3]
-; BE-NEXT: ldrh r2, [r0, #4]!
-; BE-NEXT: ldrb r3, [r0, #2]
+; BE-NEXT: mov r1, r0
+; BE-NEXT: ldr r0, [r0]
+; BE-NEXT: ldrh r2, [r1, #4]!
+; BE-NEXT: ldrb r3, [r1, #2]
; BE-NEXT: orr r2, r3, r2, lsl #8
-; BE-NEXT: orr r1, r2, r1, lsl #24
-; BE-NEXT: orr r1, r1, #384
-; BE-NEXT: strb r1, [r0, #2]
-; BE-NEXT: lsr r1, r1, #8
-; BE-NEXT: strh r1, [r0]
+; BE-NEXT: orr r0, r2, r0, lsl #24
+; BE-NEXT: orr r0, r0, #384
+; BE-NEXT: strb r0, [r1, #2]
+; BE-NEXT: lsr r0, r0, #8
+; BE-NEXT: strh r0, [r1]
; BE-NEXT: mov pc, lr
%aa = load i56, i56* %a
%b = or i56 %aa, 384
;
; BE-LABEL: i56_and_or:
; BE: @ %bb.0:
-; BE-NEXT: ldr r1, [r0]
+; BE-NEXT: ldrh r1, [r0, #4]!
; BE-NEXT: mov r2, #128
-; BE-NEXT: strb r1, [r0, #3]
-; BE-NEXT: ldrh r12, [r0, #4]!
-; BE-NEXT: ldrb r3, [r0, #2]
+; BE-NEXT: orr r1, r1, #1
; BE-NEXT: strb r2, [r0, #2]
-; BE-NEXT: orr r2, r3, r12, lsl #8
-; BE-NEXT: orr r1, r2, r1, lsl #24
-; BE-NEXT: orr r1, r1, #384
-; BE-NEXT: lsr r1, r1, #8
; BE-NEXT: strh r1, [r0]
; BE-NEXT: mov pc, lr
;
; BE-LABEL: i56_insert_bit:
; BE: @ %bb.0:
-; BE-NEXT: ldr r2, [r0]
-; BE-NEXT: strb r2, [r0, #3]
-; BE-NEXT: ldrh r12, [r0, #4]!
-; BE-NEXT: ldrb r3, [r0, #2]
-; BE-NEXT: orr r3, r3, r12, lsl #8
-; BE-NEXT: orr r2, r3, r2, lsl #24
-; BE-NEXT: bic r2, r2, #8192
+; BE-NEXT: ldrh r2, [r0, #4]!
+; BE-NEXT: mov r3, #57088
+; BE-NEXT: orr r3, r3, #16711680
+; BE-NEXT: and r2, r3, r2, lsl #8
; BE-NEXT: orr r1, r2, r1, lsl #13
; BE-NEXT: lsr r1, r1, #8
; BE-NEXT: strh r1, [r0]
; CHECK-LABEL: parity_17:
; CHECK: @ %bb.0:
; CHECK-NEXT: bfc r0, #17, #15
-; CHECK-NEXT: eor r0, r0, r0, lsr #16
-; CHECK-NEXT: eor r0, r0, r0, lsr #8
+; CHECK-NEXT: eor r1, r0, r0, lsr #16
+; CHECK-NEXT: eor r0, r1, r0, lsr #8
; CHECK-NEXT: eor r0, r0, r0, lsr #4
; CHECK-NEXT: eor r0, r0, r0, lsr #2
; CHECK-NEXT: eor r0, r0, r0, lsr #1
; PPC64-NEXT: addi 3, 5, 0
; PPC64-NEXT: .LBB2_2: # %entry
; PPC64-NEXT: sradi 4, 3, 53
-; PPC64-NEXT: clrldi 5, 3, 63
+; PPC64-NEXT: rldicl 5, 3, 63, 1
; PPC64-NEXT: addi 4, 4, 1
+; PPC64-NEXT: clrldi 6, 3, 63
; PPC64-NEXT: cmpldi 4, 1
-; PPC64-NEXT: rldicl 4, 3, 63, 1
-; PPC64-NEXT: or 5, 5, 4
-; PPC64-NEXT: rldicl 6, 5, 11, 53
-; PPC64-NEXT: addi 6, 6, 1
-; PPC64-NEXT: clrldi 7, 5, 53
-; PPC64-NEXT: cmpldi 1, 6, 1
-; PPC64-NEXT: clrldi 6, 3, 53
+; PPC64-NEXT: clrldi 4, 3, 53
+; PPC64-NEXT: or 6, 6, 5
+; PPC64-NEXT: clrldi 7, 6, 53
+; PPC64-NEXT: addi 4, 4, 2047
; PPC64-NEXT: addi 7, 7, 2047
-; PPC64-NEXT: addi 6, 6, 2047
-; PPC64-NEXT: or 4, 7, 4
-; PPC64-NEXT: or 6, 6, 3
-; PPC64-NEXT: rldicl 4, 4, 53, 11
-; PPC64-NEXT: rldicr 6, 6, 0, 52
+; PPC64-NEXT: or 4, 4, 3
+; PPC64-NEXT: or 5, 7, 5
+; PPC64-NEXT: rldicl 7, 3, 10, 54
+; PPC64-NEXT: rldicr 4, 4, 0, 52
+; PPC64-NEXT: addi 7, 7, 1
; PPC64-NEXT: bc 12, 1, .LBB2_4
; PPC64-NEXT: # %bb.3: # %entry
-; PPC64-NEXT: ori 6, 3, 0
+; PPC64-NEXT: ori 4, 3, 0
; PPC64-NEXT: b .LBB2_4
; PPC64-NEXT: .LBB2_4: # %entry
-; PPC64-NEXT: rldicl 4, 4, 11, 1
-; PPC64-NEXT: cmpdi 3, 0
-; PPC64-NEXT: std 6, -32(1)
-; PPC64-NEXT: bc 12, 5, .LBB2_6
+; PPC64-NEXT: rldicl 5, 5, 53, 11
+; PPC64-NEXT: std 4, -32(1)
+; PPC64-NEXT: rldicl 4, 5, 11, 1
+; PPC64-NEXT: cmpldi 7, 1
+; PPC64-NEXT: bc 12, 1, .LBB2_6
; PPC64-NEXT: # %bb.5: # %entry
-; PPC64-NEXT: ori 4, 5, 0
+; PPC64-NEXT: ori 4, 6, 0
; PPC64-NEXT: b .LBB2_6
; PPC64-NEXT: .LBB2_6: # %entry
+; PPC64-NEXT: cmpdi 3, 0
; PPC64-NEXT: std 4, -24(1)
; PPC64-NEXT: bc 12, 0, .LBB2_8
; PPC64-NEXT: # %bb.7: # %entry
;
; RV64I-LABEL: test_bswap_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: srliw a1, a0, 8
+; RV64I-NEXT: srli a1, a0, 8
; RV64I-NEXT: lui a2, 16
; RV64I-NEXT: addiw a2, a2, -256
; RV64I-NEXT: and a1, a1, a2
;
; RV64I-LABEL: test_bitreverse_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: srliw a1, a0, 8
+; RV64I-NEXT: srli a1, a0, 8
; RV64I-NEXT: lui a2, 16
; RV64I-NEXT: addiw a2, a2, -256
; RV64I-NEXT: and a1, a1, a2
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
; RV64M-NEXT: lui a2, 349525
; RV64M-NEXT: addiw a2, a2, 1365
; RV64M-NEXT: and a1, a1, a2
-; RV64M-NEXT: subw a0, a0, a1
+; RV64M-NEXT: sub a0, a0, a1
; RV64M-NEXT: lui a1, 209715
; RV64M-NEXT: addiw a1, a1, 819
; RV64M-NEXT: and a2, a0, a1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
; RV64M-NEXT: lui a2, 349525
; RV64M-NEXT: addiw a2, a2, 1365
; RV64M-NEXT: and a1, a1, a2
-; RV64M-NEXT: subw a0, a0, a1
+; RV64M-NEXT: sub a0, a0, a1
; RV64M-NEXT: lui a1, 209715
; RV64M-NEXT: addiw a1, a1, 819
; RV64M-NEXT: and a2, a0, a1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
; RV64M-NEXT: lui a2, 349525
; RV64M-NEXT: addiw a2, a2, 1365
; RV64M-NEXT: and a1, a1, a2
-; RV64M-NEXT: subw a0, a0, a1
+; RV64M-NEXT: sub a0, a0, a1
; RV64M-NEXT: lui a1, 209715
; RV64M-NEXT: addiw a1, a1, 819
; RV64M-NEXT: and a2, a0, a1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
; RV64M-NEXT: lui a2, 349525
; RV64M-NEXT: addiw a2, a2, 1365
; RV64M-NEXT: and a1, a1, a2
-; RV64M-NEXT: subw a0, a0, a1
+; RV64M-NEXT: sub a0, a0, a1
; RV64M-NEXT: lui a1, 209715
; RV64M-NEXT: addiw a1, a1, 819
; RV64M-NEXT: and a2, a0, a1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
; RV64M-NEXT: lui a2, 349525
; RV64M-NEXT: addiw a2, a2, 1365
; RV64M-NEXT: and a1, a1, a2
-; RV64M-NEXT: subw a0, a0, a1
+; RV64M-NEXT: sub a0, a0, a1
; RV64M-NEXT: lui a1, 209715
; RV64M-NEXT: addiw a1, a1, 819
; RV64M-NEXT: and a2, a0, a1
; CHECK-LABEL: roriw_bug:
; CHECK: # %bb.0:
; CHECK-NEXT: slli a1, a0, 31
-; CHECK-NEXT: andi a0, a0, -2
-; CHECK-NEXT: srli a2, a0, 1
-; CHECK-NEXT: or a1, a1, a2
-; CHECK-NEXT: sext.w a1, a1
-; CHECK-NEXT: xor a0, a0, a1
+; CHECK-NEXT: andi a2, a0, -2
+; CHECK-NEXT: srli a0, a0, 1
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: sext.w a0, a0
+; CHECK-NEXT: xor a0, a2, a0
; CHECK-NEXT: ret
%a = shl i64 %x, 31
%b = and i64 %x, 18446744073709551614
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: addiw a2, a2, 1365
; RV64I-NEXT: and a1, a1, a2
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: lui a1, 209715
; RV64I-NEXT: addiw a1, a1, 819
; RV64I-NEXT: and a2, a0, a1
define signext i32 @bswap_i32(i32 signext %a) nounwind {
; RV64I-LABEL: bswap_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: srliw a1, a0, 8
+; RV64I-NEXT: srli a1, a0, 8
; RV64I-NEXT: lui a2, 16
; RV64I-NEXT: addiw a2, a2, -256
; RV64I-NEXT: and a1, a1, a2
define void @bswap_i32_nosext(i32 signext %a, i32* %x) nounwind {
; RV64I-LABEL: bswap_i32_nosext:
; RV64I: # %bb.0:
-; RV64I-NEXT: srliw a2, a0, 8
+; RV64I-NEXT: srli a2, a0, 8
; RV64I-NEXT: lui a3, 16
; RV64I-NEXT: addiw a3, a3, -256
; RV64I-NEXT: and a2, a2, a3
define signext i32 @bswap_i32(i32 signext %a) nounwind {
; RV64I-LABEL: bswap_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: srliw a1, a0, 8
+; RV64I-NEXT: srli a1, a0, 8
; RV64I-NEXT: lui a2, 16
; RV64I-NEXT: addiw a2, a2, -256
; RV64I-NEXT: and a1, a1, a2
define void @bswap_i32_nosext(i32 signext %a, i32* %x) nounwind {
; RV64I-LABEL: bswap_i32_nosext:
; RV64I: # %bb.0:
-; RV64I-NEXT: srliw a2, a0, 8
+; RV64I-NEXT: srli a2, a0, 8
; RV64I-NEXT: lui a3, 16
; RV64I-NEXT: addiw a3, a3, -256
; RV64I-NEXT: and a2, a2, a3
define signext i32 @bitreverse_i32(i32 signext %a) nounwind {
; RV64I-LABEL: bitreverse_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: srliw a1, a0, 8
+; RV64I-NEXT: srli a1, a0, 8
; RV64I-NEXT: lui a2, 16
; RV64I-NEXT: addiw a2, a2, -256
; RV64I-NEXT: and a1, a1, a2
define void @bitreverse_i32_nosext(i32 signext %a, i32* %x) nounwind {
; RV64I-LABEL: bitreverse_i32_nosext:
; RV64I: # %bb.0:
-; RV64I-NEXT: srliw a2, a0, 8
+; RV64I-NEXT: srli a2, a0, 8
; RV64I-NEXT: lui a3, 16
; RV64I-NEXT: addiw a3, a3, -256
; RV64I-NEXT: and a2, a2, a3
; RV64I-NEXT: slli a2, a0, 24
; RV64I-NEXT: or a1, a2, a1
; RV64I-NEXT: srliw a2, a0, 24
-; RV64I-NEXT: srliw a0, a0, 16
+; RV64I-NEXT: srli a0, a0, 16
; RV64I-NEXT: slli a0, a0, 8
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: slliw a0, a0, 16
; RV64I-LABEL: bswap_rotl_i32:
; RV64I: # %bb.0:
; RV64I-NEXT: srliw a1, a0, 24
-; RV64I-NEXT: srliw a2, a0, 16
+; RV64I-NEXT: srli a2, a0, 16
; RV64I-NEXT: slli a2, a2, 8
; RV64I-NEXT: or a1, a2, a1
; RV64I-NEXT: slli a2, a0, 8
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: srli a0, a0, 1
; RV64I-NEXT: and a0, a0, s0
-; RV64I-NEXT: subw a0, a1, a0
+; RV64I-NEXT: sub a0, a1, a0
; RV64I-NEXT: and a2, a0, s1
; RV64I-NEXT: srli a0, a0, 2
; RV64I-NEXT: and a0, a0, s1
; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
; CHECK-NEXT: .cfi_offset %r14, -48
; CHECK-NEXT: .cfi_offset %r15, -40
-; CHECK-NEXT: vlgvf %r0, %v26, 3
-; CHECK-NEXT: vlgvf %r4, %v24, 1
-; CHECK-NEXT: vlgvf %r3, %v24, 2
-; CHECK-NEXT: srlk %r1, %r0, 8
+; CHECK-NEXT: vlgvf %r1, %v26, 3
+; CHECK-NEXT: vlgvf %r0, %v26, 2
+; CHECK-NEXT: stc %r1, 30(%r2)
+; CHECK-NEXT: srlk %r3, %r1, 8
+; CHECK-NEXT: risbgn %r1, %r1, 33, 167, 0
+; CHECK-NEXT: vlgvf %r5, %v24, 2
+; CHECK-NEXT: rosbg %r1, %r0, 2, 32, 31
+; CHECK-NEXT: sth %r3, 28(%r2)
+; CHECK-NEXT: srlg %r1, %r1, 24
+; CHECK-NEXT: vlgvf %r3, %v24, 3
+; CHECK-NEXT: st %r1, 24(%r2)
+; CHECK-NEXT: vlgvf %r1, %v26, 0
+; CHECK-NEXT: risbgn %r14, %r5, 6, 164, 27
+; CHECK-NEXT: sllg %r4, %r3, 60
+; CHECK-NEXT: rosbg %r14, %r3, 37, 63, 60
+; CHECK-NEXT: sllg %r3, %r14, 8
+; CHECK-NEXT: rosbg %r4, %r1, 4, 34, 29
+; CHECK-NEXT: rosbg %r3, %r4, 56, 63, 8
+; CHECK-NEXT: stg %r3, 8(%r2)
+; CHECK-NEXT: vlgvf %r3, %v24, 1
+; CHECK-NEXT: sllg %r4, %r3, 58
+; CHECK-NEXT: rosbg %r4, %r5, 6, 36, 27
; CHECK-NEXT: vlgvf %r5, %v24, 0
-; CHECK-NEXT: sth %r1, 28(%r2)
-; CHECK-NEXT: sllg %r1, %r4, 58
; CHECK-NEXT: sllg %r5, %r5, 25
-; CHECK-NEXT: stc %r0, 30(%r2)
-; CHECK-NEXT: rosbg %r1, %r3, 6, 36, 27
-; CHECK-NEXT: vlgvf %r3, %v24, 3
-; CHECK-NEXT: rosbg %r5, %r4, 39, 63, 58
-; CHECK-NEXT: sllg %r4, %r5, 8
-; CHECK-NEXT: rosbg %r1, %r3, 37, 63, 60
-; CHECK-NEXT: vlgvf %r5, %v26, 1
-; CHECK-NEXT: rosbg %r4, %r1, 56, 63, 8
-; CHECK-NEXT: stg %r4, 0(%r2)
-; CHECK-NEXT: vlgvf %r4, %v26, 2
-; CHECK-NEXT: sllg %r14, %r5, 62
-; CHECK-NEXT: sllg %r3, %r3, 60
-; CHECK-NEXT: rosbg %r14, %r4, 2, 32, 31
-; CHECK-NEXT: rosbg %r14, %r0, 33, 63, 0
-; CHECK-NEXT: srlg %r0, %r14, 24
-; CHECK-NEXT: st %r0, 24(%r2)
-; CHECK-NEXT: vlgvf %r0, %v26, 0
-; CHECK-NEXT: rosbg %r3, %r0, 4, 34, 29
-; CHECK-NEXT: sllg %r0, %r1, 8
-; CHECK-NEXT: rosbg %r3, %r5, 35, 63, 62
-; CHECK-NEXT: rosbg %r0, %r3, 56, 63, 8
-; CHECK-NEXT: stg %r0, 8(%r2)
-; CHECK-NEXT: sllg %r0, %r3, 8
-; CHECK-NEXT: rosbg %r0, %r14, 56, 63, 8
+; CHECK-NEXT: rosbg %r5, %r3, 39, 63, 58
+; CHECK-NEXT: sllg %r3, %r5, 8
+; CHECK-NEXT: rosbg %r3, %r4, 56, 63, 8
+; CHECK-NEXT: stg %r3, 0(%r2)
+; CHECK-NEXT: vlgvf %r3, %v26, 1
+; CHECK-NEXT: sllg %r4, %r3, 62
+; CHECK-NEXT: rosbg %r4, %r0, 2, 32, 31
+; CHECK-NEXT: risbgn %r0, %r1, 4, 162, 29
+; CHECK-NEXT: rosbg %r0, %r3, 35, 63, 62
+; CHECK-NEXT: sllg %r0, %r0, 8
+; CHECK-NEXT: rosbg %r0, %r4, 56, 63, 8
; CHECK-NEXT: stg %r0, 16(%r2)
; CHECK-NEXT: lmg %r14, %r15, 112(%r15)
; CHECK-NEXT: br %r14
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andb $15, %al
+; X86-NEXT: andb $8, %al
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: addb %cl, %dl
; X86-NEXT: andb $4, %dl
-; X86-NEXT: shlb $3, %cl
-; X86-NEXT: andb $8, %cl
-; X86-NEXT: orb %dl, %cl
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: shrb %dl
-; X86-NEXT: andb $2, %dl
-; X86-NEXT: orb %cl, %dl
+; X86-NEXT: movb %cl, %ah
+; X86-NEXT: shlb $3, %ah
+; X86-NEXT: andb $8, %ah
+; X86-NEXT: orb %dl, %ah
+; X86-NEXT: shrb %cl
+; X86-NEXT: andb $2, %cl
+; X86-NEXT: orb %ah, %cl
; X86-NEXT: shrb $3, %al
-; X86-NEXT: orb %dl, %al
+; X86-NEXT: orb %cl, %al
; X86-NEXT: retl
;
; X64-LABEL: test_bitreverse_i4:
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: leal (%rdi,%rdi), %ecx
-; X64-NEXT: leal (,%rdi,8), %edx
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andb $15, %al
+; X64-NEXT: andb $8, %al
+; X64-NEXT: leal (%rdi,%rdi), %ecx
; X64-NEXT: andb $4, %cl
+; X64-NEXT: leal (,%rdi,8), %edx
; X64-NEXT: andb $8, %dl
; X64-NEXT: orb %cl, %dl
-; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: shrb %cl
-; X64-NEXT: andb $2, %cl
-; X64-NEXT: orb %dl, %cl
+; X64-NEXT: shrb %dil
+; X64-NEXT: andb $2, %dil
+; X64-NEXT: orb %dil, %dl
; X64-NEXT: shrb $3, %al
-; X64-NEXT: orb %cl, %al
+; X64-NEXT: orb %dl, %al
; X64-NEXT: retq
;
; X86XOP-LABEL: test_bitreverse_i4:
;
; NO-POPCOUNT-LABEL: test4:
; NO-POPCOUNT: # %bb.0:
-; NO-POPCOUNT-NEXT: andb $127, %dil
-; NO-POPCOUNT-NEXT: movl %edi, %eax
-; NO-POPCOUNT-NEXT: shrb %al
-; NO-POPCOUNT-NEXT: andb $21, %al
-; NO-POPCOUNT-NEXT: subb %al, %dil
; NO-POPCOUNT-NEXT: movl %edi, %ecx
+; NO-POPCOUNT-NEXT: andb $127, %cl
+; NO-POPCOUNT-NEXT: shrb %dil
+; NO-POPCOUNT-NEXT: andb $21, %dil
+; NO-POPCOUNT-NEXT: subb %dil, %cl
+; NO-POPCOUNT-NEXT: movl %ecx, %eax
+; NO-POPCOUNT-NEXT: andb $51, %al
+; NO-POPCOUNT-NEXT: shrb $2, %cl
; NO-POPCOUNT-NEXT: andb $51, %cl
-; NO-POPCOUNT-NEXT: shrb $2, %dil
-; NO-POPCOUNT-NEXT: andb $51, %dil
-; NO-POPCOUNT-NEXT: addb %dil, %cl
+; NO-POPCOUNT-NEXT: addb %al, %cl
; NO-POPCOUNT-NEXT: movl %ecx, %eax
; NO-POPCOUNT-NEXT: shrb $4, %al
; NO-POPCOUNT-NEXT: addb %cl, %al
; X86-NEXT: shll $16, %ecx
; X86-NEXT: orl %edx, %ecx
; X86-NEXT: orl $384, %ecx # imm = 0x180
-; X86-NEXT: andl $16777088, %ecx # imm = 0xFFFF80
+; X86-NEXT: andl $-128, %ecx
; X86-NEXT: movw %cx, (%eax)
; X86-NEXT: retl
;
; X64-NEXT: shll $16, %ecx
; X64-NEXT: orl %eax, %ecx
; X64-NEXT: orl $384, %ecx # imm = 0x180
-; X64-NEXT: andl $16777088, %ecx # imm = 0xFFFF80
+; X64-NEXT: andl $-128, %ecx
; X64-NEXT: movw %cx, (%rdi)
; X64-NEXT: retq
%b = load i24, ptr %a, align 1
;
; X64-LABEL: i56_or:
; X64: # %bb.0:
-; X64-NEXT: movzwl 4(%rdi), %eax
-; X64-NEXT: movzbl 6(%rdi), %ecx
-; X64-NEXT: movb %cl, 6(%rdi)
-; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
-; X64-NEXT: shll $16, %ecx
-; X64-NEXT: orl %eax, %ecx
-; X64-NEXT: shlq $32, %rcx
-; X64-NEXT: movl (%rdi), %eax
-; X64-NEXT: orq %rcx, %rax
-; X64-NEXT: orq $384, %rax # imm = 0x180
-; X64-NEXT: movl %eax, (%rdi)
-; X64-NEXT: shrq $32, %rax
-; X64-NEXT: movw %ax, 4(%rdi)
+; X64-NEXT: movzbl 6(%rdi), %eax
+; X64-NEXT: shll $16, %eax
+; X64-NEXT: movzwl 4(%rdi), %ecx
+; X64-NEXT: movw %cx, 4(%rdi)
+; X64-NEXT: shrq $16, %rax
+; X64-NEXT: movb %al, 6(%rdi)
+; X64-NEXT: orl $384, (%rdi) # imm = 0x180
; X64-NEXT: retq
%aa = load i56, ptr %a, align 1
%b = or i56 %aa, 384
;
; X64-LABEL: i56_insert_bit:
; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: movzwl 4(%rdi), %ecx
-; X64-NEXT: movzbl 6(%rdi), %edx
-; X64-NEXT: movb %dl, 6(%rdi)
-; X64-NEXT: # kill: def $edx killed $edx def $rdx
-; X64-NEXT: shll $16, %edx
-; X64-NEXT: orl %ecx, %edx
-; X64-NEXT: shlq $32, %rdx
-; X64-NEXT: movl (%rdi), %ecx
-; X64-NEXT: orq %rdx, %rcx
-; X64-NEXT: shlq $13, %rax
-; X64-NEXT: andq $-8193, %rcx # imm = 0xDFFF
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: movl %ecx, (%rdi)
-; X64-NEXT: shrq $32, %rcx
-; X64-NEXT: movw %cx, 4(%rdi)
+; X64-NEXT: movzwl 4(%rdi), %eax
+; X64-NEXT: movzbl 6(%rdi), %ecx
+; X64-NEXT: movb %cl, 6(%rdi)
+; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT: shll $16, %ecx
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: shlq $32, %rcx
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: orq %rcx, %rax
+; X64-NEXT: shll $13, %esi
+; X64-NEXT: andq $-8193, %rax # imm = 0xDFFF
+; X64-NEXT: orl %eax, %esi
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movw %ax, 4(%rdi)
+; X64-NEXT: movl %esi, (%rdi)
; X64-NEXT: retq
%extbit = zext i1 %bit to i56
%b = load i56, ptr %a, align 1
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-- -mattr=-bmi | FileCheck %s
+; TODO: This might not be testing the original issue anymore? Should the movl still be removed?
define fastcc i32 @t() nounwind {
; CHECK-LABEL: t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movzwl 0, %eax
-; CHECK-NEXT: orl $2, %eax
-; CHECK-NEXT: movw %ax, 0
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: orl $2, %ecx
+; CHECK-NEXT: movw %cx, 0
; CHECK-NEXT: shrl $3, %eax
; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: retl
; CHECK-NEXT: movzbl -9(%rsp), %ecx
; CHECK-NEXT: movzbl -10(%rsp), %edx
; CHECK-NEXT: movzbl -11(%rsp), %esi
+; CHECK-NEXT: movzbl %cl, %edi
+; CHECK-NEXT: shrb %cl
+; CHECK-NEXT: movb %cl, -2(%rsp)
; CHECK-NEXT: andl $31, %eax
; CHECK-NEXT: andl $31, %esi
; CHECK-NEXT: shll $5, %esi
; CHECK-NEXT: andl $31, %edx
; CHECK-NEXT: shll $10, %edx
; CHECK-NEXT: orl %esi, %edx
-; CHECK-NEXT: movzbl %cl, %eax
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shll $15, %ecx
-; CHECK-NEXT: orl %edx, %ecx
-; CHECK-NEXT: movw %cx, -4(%rsp)
-; CHECK-NEXT: shrl $16, %ecx
-; CHECK-NEXT: andl $15, %ecx
-; CHECK-NEXT: movb %cl, -2(%rsp)
-; CHECK-NEXT: movb %al, -5(%rsp)
-; CHECK-NEXT: cmpb $31, %al
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: shll $15, %eax
+; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: movw %ax, -4(%rsp)
+; CHECK-NEXT: movb %dil, -5(%rsp)
+; CHECK-NEXT: cmpb $31, %dil
; CHECK-NEXT: je .LBB0_2
; CHECK-NEXT: # %bb.1: # %Then
; CHECK-NEXT: int3
; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: movzbl %al, %eax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl %ecx
-; AVX512F-NEXT: andl $-43, %ecx
-; AVX512F-NEXT: subl %ecx, %eax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512F-NEXT: shrl $2, %eax
+; AVX512F-NEXT: movzbl %al, %ecx
+; AVX512F-NEXT: shrl %eax
+; AVX512F-NEXT: andl $85, %eax
+; AVX512F-NEXT: subl %eax, %ecx
+; AVX512F-NEXT: movl %ecx, %eax
; AVX512F-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512F-NEXT: addl %ecx, %eax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $4, %ecx
+; AVX512F-NEXT: shrl $2, %ecx
+; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333
; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512F-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
+; AVX512F-NEXT: movl %ecx, %eax
+; AVX512F-NEXT: shrl $4, %eax
+; AVX512F-NEXT: addl %ecx, %eax
+; AVX512F-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F
+; AVX512F-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101
; AVX512F-NEXT: shrl $24, %eax
; AVX512F-NEXT: kshiftrw $8, %k1, %k2
; AVX512F-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
; AVX512VLBW-NEXT: vpsllw $7, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpmovb2m %xmm2, %k1
; AVX512VLBW-NEXT: kmovd %k1, %eax
-; AVX512VLBW-NEXT: movzbl %al, %eax
-; AVX512VLBW-NEXT: movl %eax, %ecx
-; AVX512VLBW-NEXT: shrl %ecx
-; AVX512VLBW-NEXT: andl $-43, %ecx
-; AVX512VLBW-NEXT: subl %ecx, %eax
-; AVX512VLBW-NEXT: movl %eax, %ecx
-; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLBW-NEXT: shrl $2, %eax
+; AVX512VLBW-NEXT: movzbl %al, %ecx
+; AVX512VLBW-NEXT: shrl %eax
+; AVX512VLBW-NEXT: andl $85, %eax
+; AVX512VLBW-NEXT: subl %eax, %ecx
+; AVX512VLBW-NEXT: movl %ecx, %eax
; AVX512VLBW-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLBW-NEXT: addl %ecx, %eax
-; AVX512VLBW-NEXT: movl %eax, %ecx
-; AVX512VLBW-NEXT: shrl $4, %ecx
+; AVX512VLBW-NEXT: shrl $2, %ecx
+; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333
; AVX512VLBW-NEXT: addl %eax, %ecx
-; AVX512VLBW-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLBW-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
+; AVX512VLBW-NEXT: movl %ecx, %eax
+; AVX512VLBW-NEXT: shrl $4, %eax
+; AVX512VLBW-NEXT: addl %ecx, %eax
+; AVX512VLBW-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F
+; AVX512VLBW-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101
; AVX512VLBW-NEXT: shrl $24, %eax
; AVX512VLBW-NEXT: kshiftrw $8, %k1, %k2
; AVX512VLBW-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000
; CHECK-NEXT: cmovll %ecx, %edx
; CHECK-NEXT: pextrw $1, %xmm0, %esi
-; CHECK-NEXT: movswl %si, %edi
-; CHECK-NEXT: leal (%rdi,%rdi), %eax
+; CHECK-NEXT: leal (%rsi,%rsi), %edi
+; CHECK-NEXT: movswl %si, %eax
; CHECK-NEXT: movl %eax, %esi
; CHECK-NEXT: shrl $16, %esi
-; CHECK-NEXT: shldw $1, %ax, %si
-; CHECK-NEXT: sarl $15, %edi
-; CHECK-NEXT: cmpl $16384, %edi # imm = 0x4000
+; CHECK-NEXT: shldw $1, %di, %si
+; CHECK-NEXT: sarl $16, %eax
+; CHECK-NEXT: cmpl $16384, %eax # imm = 0x4000
; CHECK-NEXT: cmovgel %r8d, %esi
-; CHECK-NEXT: cmpl $-16384, %edi # imm = 0xC000
+; CHECK-NEXT: cmpl $-16384, %eax # imm = 0xC000
; CHECK-NEXT: cmovll %ecx, %esi
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: cwtl