/// them) when they are deleted from the underlying DAG. It relies on
/// stable indices of nodes within the worklist.
DenseMap<SDNode *, unsigned> WorklistMap;
+ /// This records all nodes attempted to be added to the worklist since we
+ /// considered a new worklist entry. As we do not add duplicate nodes to the
+ /// worklist, this is different from the tail of the worklist.
+ SmallSetVector<SDNode *, 32> PruningList;
/// Set of nodes which have been combined (at least once).
///
AddToWorklist(Node);
}
+ // Prune potentially dangling nodes. This is called after
+ // any visit to a node, but should also be called during a visit after any
+ // failed combine which may have created a DAG node.
+ void clearAddedDanglingWorklistEntries() {
+ // Check any nodes added to the worklist to see if they are prunable.
+ while (!PruningList.empty()) {
+ auto *N = PruningList.pop_back_val();
+ if (N->use_empty())
+ recursivelyDeleteUnusedNodes(N);
+ }
+ }
+
+ SDNode *getNextWorklistEntry() {
+ // Before we do any work, remove nodes that are not in use.
+ clearAddedDanglingWorklistEntries();
+ SDNode *N = nullptr;
+ // The Worklist holds the SDNodes in order, but it may contain null
+ // entries.
+ while (!N && !Worklist.empty()) {
+ N = Worklist.pop_back_val();
+ }
+
+ if (N) {
+ bool GoodWorklistEntry = WorklistMap.erase(N);
+ (void)GoodWorklistEntry;
+ assert(GoodWorklistEntry &&
+ "Found a worklist entry without a corresponding map entry!");
+ }
+ return N;
+ }
+
/// Call the node-specific routine that folds each particular type of node.
SDValue visit(SDNode *N);
MaximumLegalStoreInBits = VT.getSizeInBits();
}
+ void ConsiderForPruning(SDNode *N) {
+ // Mark this for potential pruning.
+ PruningList.insert(N);
+ }
+
/// Add to the worklist making sure its instance is at the back (next to be
/// processed.)
void AddToWorklist(SDNode *N) {
if (N->getOpcode() == ISD::HANDLENODE)
return;
+ ConsiderForPruning(N);
+
if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
Worklist.push_back(N);
}
/// Remove all instances of N from the worklist.
void removeFromWorklist(SDNode *N) {
CombinedNodes.erase(N);
+ PruningList.remove(N);
auto It = WorklistMap.find(N);
if (It == WorklistMap.end())
explicit WorklistInserter(DAGCombiner &dc)
: SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
- // This should eventually be pruning.
- void NodeInserted(SDNode *N) override { }
+ // FIXME: Ideally we could add N to the worklist, but this causes exponential
+ // compile time costs in large DAGs, e.g. Halide.
+ void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); }
};
} // end anonymous namespace
// changes of the root.
HandleSDNode Dummy(DAG.getRoot());
- // While the worklist isn't empty, find a node and try to combine it.
- while (!WorklistMap.empty()) {
- SDNode *N;
- // The Worklist holds the SDNodes in order, but it may contain null entries.
- do {
- N = Worklist.pop_back_val();
- } while (!N);
-
- bool GoodWorklistEntry = WorklistMap.erase(N);
- (void)GoodWorklistEntry;
- assert(GoodWorklistEntry &&
- "Found a worklist entry without a corresponding map entry!");
-
+ // While we have a valid worklist entry node, try to combine it.
+ while (SDNode *N = getNextWorklistEntry()) {
// If N has no uses, it is dead. Make sure to revisit all N's operands once
// N is deleted from the DAG, since they too may now be dead or may have a
// reduced number of uses, allowing other xforms.
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000
; SI-NOT: and
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000
; SI-NOT: and
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000
; SI-NOT: and
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000
; SI-NOT: and
; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64:
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0
; SI-NOT: and
; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64:
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0
; SI-NOT: and
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000
; SI-NOT: and
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000
; SI-NOT: and
; Shift into upper 32-bits
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
; SI-NOT: and
; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64:
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
; SI-NOT: and
}
; GCN-LABEL: {{^}}ps_mesa_inreg_v2i16:
-; VI: s_lshr_b32 s1, s0, 16
-; VI: s_add_i32 s1, s1, 1
+; VI: s_and_b32 s1, s0, 0xffff0000
; VI: s_add_i32 s0, s0, 1
+; VI: s_add_i32 s1, s1, 0x10000
; VI: s_and_b32 s0, s0, 0xffff
-; VI: s_lshl_b32 s1, s1, 16
; VI: s_or_b32 s0, s0, s1
; VI: v_mov_b32_e32 v0, s0
; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
-; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MAD]]
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s3, s1, s0
-; GFX8-NEXT: s_and_b32 s0, s2, s0
-; GFX8-NEXT: s_lshr_b32 s2, s2, 16
+; GFX8-NEXT: s_and_b32 s3, s1, s2
; GFX8-NEXT: s_lshr_b32 s1, s1, 16
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: s_and_b32 s2, s0, s2
+; GFX8-NEXT: s_lshr_b32 s0, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff
+; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2
+; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s5, s1, 8
-; GFX8-NEXT: s_lshr_b32 s6, s2, 8
-; GFX8-NEXT: s_sext_i32_i8 s4, s2
-; GFX8-NEXT: s_bfe_i32 s5, s5, 0x80000
-; GFX8-NEXT: s_bfe_i32 s6, s6, 0x80000
-; GFX8-NEXT: s_bfe_i32 s8, s2, 0x80010
-; GFX8-NEXT: s_lshr_b32 s2, s2, 24
-; GFX8-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NEXT: s_sext_i32_i8 s3, s1
-; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80010
-; GFX8-NEXT: s_lshr_b32 s1, s1, 24
-; GFX8-NEXT: s_and_b32 s4, s0, s5
-; GFX8-NEXT: s_and_b32 s5, s0, s6
-; GFX8-NEXT: s_bfe_i32 s1, s1, 0x80000
-; GFX8-NEXT: s_bfe_i32 s2, s2, 0x80000
+; GFX8-NEXT: s_sext_i32_i8 s0, s2
+; GFX8-NEXT: s_sext_i32_i8 s1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_bfe_i32 s4, s3, 0x80008
+; GFX8-NEXT: s_bfe_i32 s5, s3, 0x80010
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80008
+; GFX8-NEXT: s_bfe_i32 s4, s2, 0x80010
+; GFX8-NEXT: s_ashr_i32 s3, s3, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: s_and_b32 s1, s0, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s8
-; GFX8-NEXT: s_and_b32 s0, s0, s2
+; GFX8-NEXT: s_ashr_i32 s2, s2, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v5, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s7, v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s4, v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 8
-; GFX9-NODL-NEXT: s_lshr_b32 s6, s2, 8
-; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2
-; GFX9-NODL-NEXT: s_bfe_i32 s5, s5, 0x80000
-; GFX9-NODL-NEXT: s_bfe_i32 s6, s6, 0x80000
-; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010
-; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1
-; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-NODL-NEXT: s_and_b32 s4, s0, s5
-; GFX9-NODL-NEXT: s_and_b32 s5, s0, s6
-; GFX9-NODL-NEXT: s_bfe_i32 s1, s1, 0x80000
-; GFX9-NODL-NEXT: s_bfe_i32 s2, s2, 0x80000
+; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2
+; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NODL-NEXT: s_bfe_i32 s4, s3, 0x80008
+; GFX9-NODL-NEXT: s_bfe_i32 s5, s3, 0x80010
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NODL-NEXT: s_bfe_i32 s1, s2, 0x80008
+; GFX9-NODL-NEXT: s_bfe_i32 s4, s2, 0x80010
+; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NODL-NEXT: s_and_b32 s1, s0, s1
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-NODL-NEXT: s_and_b32 s0, s0, s2
+; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v5, v2
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s7, v4, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v5, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 8
-; GFX9-DL-NEXT: s_lshr_b32 s6, s2, 8
-; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2
-; GFX9-DL-NEXT: s_bfe_i32 s5, s5, 0x80000
-; GFX9-DL-NEXT: s_bfe_i32 s6, s6, 0x80000
-; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x80010
-; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT: s_sext_i32_i8 s3, s1
-; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x80010
-; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-DL-NEXT: s_and_b32 s4, s0, s5
-; GFX9-DL-NEXT: s_and_b32 s5, s0, s6
-; GFX9-DL-NEXT: s_bfe_i32 s1, s1, 0x80000
-; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x80000
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-DL-NEXT: s_and_b32 s1, s0, s1
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-DL-NEXT: s_and_b32 s0, s0, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v5, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v4, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008
+; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008
; GFX8-NEXT: s_and_b32 s3, s1, s2
-; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX8-NEXT: s_and_b32 s2, s0, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_lshr_b32 s0, s0, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008
+; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008
; GFX8-NEXT: s_and_b32 s3, s1, s2
-; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX8-NEXT: s_and_b32 s2, s0, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_lshr_b32 s0, s0, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: s_movk_i32 s2, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s3, s1, s2
-; GFX8-NEXT: s_and_b32 s2, s0, s2
+; GFX8-NEXT: s_and_b32 s3, s2, s0
+; GFX8-NEXT: s_and_b32 s0, s1, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80008
; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008
-; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80008
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2
-; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2
+; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
+; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT: s_bfe_u32 s2, s2, 0x80008
; GFX9-NODL-NEXT: s_bfe_u32 s1, s1, 0x80008
-; GFX9-NODL-NEXT: s_bfe_u32 s0, s0, 0x80008
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_and_b32 s3, s1, s2
-; GFX9-DL-NEXT: s_and_b32 s2, s0, s2
+; GFX9-DL-NEXT: s_and_b32 s3, s2, s0
+; GFX9-DL-NEXT: s_and_b32 s0, s1, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x80008
; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x80008
-; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x80008
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s3, s1, s0
-; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX8-NEXT: s_and_b32 s0, s2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_lshr_b32 s2, s2, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
-; GFX8-NEXT: s_and_b32 s3, s2, s0
-; GFX8-NEXT: s_and_b32 s0, s1, s0
+; GFX8-NEXT: s_and_b32 s3, s1, s0
; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_and_b32 s0, s2, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_lshr_b32 s2, s2, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s3, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s0, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
-; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
-; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
+; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v4, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x80008
-; GFX9-DL-NEXT: s_and_b32 s3, s2, s0
-; GFX9-DL-NEXT: s_and_b32 s0, s1, s0
+; GFX9-DL-NEXT: s_and_b32 s3, s1, s0
; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-DL-NEXT: s_and_b32 s0, s2, s0
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-DL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v4, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v4, v2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80000
-; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80008
-; GFX8-NEXT: s_bfe_i32 s4, s1, 0x80000
-; GFX8-NEXT: s_and_b32 s3, s2, s3
-; GFX8-NEXT: s_and_b32 s2, s2, s4
-; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
-; GFX8-NEXT: s_bfe_u32 s8, s1, 0x80010
-; GFX8-NEXT: v_mov_b32_e32 v5, s2
-; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010
-; GFX8-NEXT: s_lshr_b32 s1, s1, 24
-; GFX8-NEXT: v_mov_b32_e32 v4, s8
-; GFX8-NEXT: s_lshr_b32 s0, s0, 24
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s3, v5, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v4, v2
+; GFX8-NEXT: s_bfe_u32 s0, s2, 0x80008
+; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_sext_i32_i8 s4, s3
+; GFX8-NEXT: s_bfe_u32 s5, s3, 0x80010
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: s_sext_i32_i8 s1, s2
+; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80010
+; GFX8-NEXT: s_lshr_b32 s3, s3, 24
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: s_lshr_b32 s2, s2, 24
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s4, v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_i32 s3, s0, 0x80000
-; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80008
-; GFX9-NODL-NEXT: s_bfe_i32 s4, s1, 0x80000
-; GFX9-NODL-NEXT: s_and_b32 s3, s2, s3
-; GFX9-NODL-NEXT: s_and_b32 s2, s2, s4
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NODL-NEXT: s_bfe_u32 s8, s1, 0x80010
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v5, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v4, v2
+; GFX9-NODL-NEXT: s_bfe_u32 s0, s2, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s1, s3, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s3
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s3, 0x80010
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s2
+; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80010
+; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24
+; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v5, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_bfe_i32 s3, s0, 0x80000
-; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x80008
-; GFX9-DL-NEXT: s_bfe_i32 s4, s1, 0x80000
-; GFX9-DL-NEXT: s_and_b32 s3, s2, s3
-; GFX9-DL-NEXT: s_and_b32 s2, s2, s4
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x80008
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-DL-NEXT: s_bfe_u32 s8, s1, 0x80010
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x80010
-; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v4, v2
+; GFX9-DL-NEXT: s_bfe_u32 s0, s2, 0x80008
+; GFX9-DL-NEXT: s_bfe_u32 s1, s3, 0x80008
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-DL-NEXT: s_sext_i32_i8 s4, s3
+; GFX9-DL-NEXT: s_bfe_u32 s5, s3, 0x80010
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-DL-NEXT: s_sext_i32_i8 s1, s2
+; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x80010
+; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v5, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s0, s2, 4
-; GFX8-NEXT: s_lshr_b32 s1, s4, 4
-; GFX8-NEXT: s_bfe_i32 s5, s4, 0x40000
-; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s0
-; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1
-; GFX8-NEXT: s_bfe_i32 s0, s4, 0x40008
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40000
+; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000
+; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_bfe_i32 s5, s4, 0x40004
+; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40008
; GFX8-NEXT: s_lshr_b32 s1, s2, 12
-; GFX8-NEXT: s_lshr_b32 s5, s4, 12
-; GFX8-NEXT: v_mov_b32_e32 v6, s0
-; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40008
-; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
-; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s1
-; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5
-; GFX8-NEXT: v_mul_i32_i24_e32 v6, s7, v6
-; GFX8-NEXT: s_lshr_b32 s0, s2, 20
-; GFX8-NEXT: s_lshr_b32 s1, s4, 20
-; GFX8-NEXT: s_bfe_i32 s5, s4, 0x40010
-; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s0
-; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s1
-; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v13, s5
-; GFX8-NEXT: s_lshr_b32 s0, s2, 28
-; GFX8-NEXT: s_lshr_b32 s9, s4, 28
-; GFX8-NEXT: s_bfe_i32 s4, s4, 0x40018
-; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s0
-; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s9
-; GFX8-NEXT: s_bfe_i32 s2, s2, 0x40018
-; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
+; GFX8-NEXT: s_lshr_b32 s7, s4, 12
+; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004
+; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v7, s5
+; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s1
+; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s7
+; GFX8-NEXT: v_mul_i32_i24_e32 v4, s9, v4
+; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40010
+; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
+; GFX8-NEXT: s_bfe_i32 s12, s4, 0x40014
+; GFX8-NEXT: s_bfe_i32 s11, s2, 0x40010
+; GFX8-NEXT: v_mov_b32_e32 v8, s10
+; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40018
+; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40014
+; GFX8-NEXT: v_mov_b32_e32 v9, s12
+; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40018
+; GFX8-NEXT: s_ashr_i32 s4, s4, 28
+; GFX8-NEXT: v_mov_b32_e32 v10, s14
+; GFX8-NEXT: s_ashr_i32 s2, s2, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v2, s6, v5, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2
-; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX8-NEXT: v_mad_u32_u24 v2, v7, v8, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s8, v13, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, v9, v10, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2
+; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, v11, v12, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s0, s2, 4
-; GFX9-NEXT: s_lshr_b32 s1, s4, 4
-; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40000
-; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s0
-; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1
-; GFX9-NEXT: s_bfe_i32 s0, s4, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000
+; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000
+; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40004
+; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40008
; GFX9-NEXT: s_lshr_b32 s1, s2, 12
-; GFX9-NEXT: s_lshr_b32 s5, s4, 12
-; GFX9-NEXT: v_mov_b32_e32 v6, s0
-; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40008
-; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3
-; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s1
-; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s5
-; GFX9-NEXT: v_mul_i32_i24_e32 v6, s7, v6
-; GFX9-NEXT: s_lshr_b32 s0, s2, 20
-; GFX9-NEXT: s_lshr_b32 s1, s4, 20
-; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40010
-; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s0
-; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s1
-; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v13, s5
-; GFX9-NEXT: s_lshr_b32 s0, s2, 28
-; GFX9-NEXT: s_lshr_b32 s9, s4, 28
-; GFX9-NEXT: s_bfe_i32 s4, s4, 0x40018
-; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s0
-; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s9
-; GFX9-NEXT: s_bfe_i32 s2, s2, 0x40018
-; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12
+; GFX9-NEXT: s_lshr_b32 s7, s4, 12
+; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004
+; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-NEXT: v_mov_b32_e32 v7, s5
+; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s1
+; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s7
+; GFX9-NEXT: v_mul_i32_i24_e32 v4, s9, v4
+; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40010
+; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
+; GFX9-NEXT: s_bfe_i32 s12, s4, 0x40014
+; GFX9-NEXT: s_bfe_i32 s11, s2, 0x40010
+; GFX9-NEXT: v_mov_b32_e32 v8, s10
+; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40018
+; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40014
+; GFX9-NEXT: v_mov_b32_e32 v9, s12
+; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40018
+; GFX9-NEXT: s_ashr_i32 s4, s4, 28
+; GFX9-NEXT: v_mov_b32_e32 v10, s14
+; GFX9-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_i32_i24 v2, s6, v5, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, v3, v4, v2
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NEXT: v_mad_u32_u24 v2, v7, v8, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s8, v13, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, v9, v10, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2
+; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NEXT: v_mad_u32_u24 v2, v5, v6, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mad_i32_i24 v2, s2, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, v11, v12, v2
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 4
-; GFX9-DL-NEXT: s_lshr_b32 s1, s4, 4
-; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40000
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1
-; GFX9-DL-NEXT: s_bfe_i32 s0, s4, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000
+; GFX9-DL-NEXT: s_bfe_i32 s0, s2, 0x40000
+; GFX9-DL-NEXT: s_bfe_i32 s1, s4, 0x40000
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40004
+; GFX9-DL-NEXT: s_bfe_i32 s6, s4, 0x40008
; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 12
-; GFX9-DL-NEXT: s_lshr_b32 s5, s4, 12
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s0
-; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40008
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s1
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s5
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v6, s7, v6
-; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 20
-; GFX9-DL-NEXT: s_lshr_b32 s1, s4, 20
-; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40010
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s0
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s1
-; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v13, s5
-; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 28
-; GFX9-DL-NEXT: s_lshr_b32 s9, s4, 28
-; GFX9-DL-NEXT: s_bfe_i32 s4, s4, 0x40018
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s0
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s9
-; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x40018
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
+; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 12
+; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004
+; GFX9-DL-NEXT: s_bfe_i32 s9, s2, 0x40008
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s5
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s7
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, s9, v4
+; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40010
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
+; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x40014
+; GFX9-DL-NEXT: s_bfe_i32 s11, s2, 0x40010
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10
+; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40018
+; GFX9-DL-NEXT: s_bfe_i32 s13, s2, 0x40014
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12
+; GFX9-DL-NEXT: s_bfe_i32 s15, s2, 0x40018
+; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28
+; GFX9-DL-NEXT: v_mov_b32_e32 v10, s14
+; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v7, v8, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v13, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v9, v10, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2
+; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, v5, v6, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v11, v12, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s7, s0, 4
-; GFX8-NEXT: s_lshr_b32 s11, s1, 4
-; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s7
-; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s11
-; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40000
-; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
+; GFX8-NEXT: s_lshr_b32 s4, s0, 12
+; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40000
+; GFX8-NEXT: s_lshr_b32 s5, s1, 12
+; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40004
+; GFX8-NEXT: s_bfe_i32 s11, s1, 0x40008
+; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40000
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4
+; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s5
+; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004
+; GFX8-NEXT: s_bfe_i32 s10, s0, 0x40008
+; GFX8-NEXT: v_mov_b32_e32 v3, s11
+; GFX8-NEXT: v_mov_b32_e32 v7, s9
; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX8-NEXT: s_lshr_b32 s6, s0, 12
-; GFX8-NEXT: s_lshr_b32 s10, s1, 12
-; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40008
-; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40000
-; GFX8-NEXT: v_mov_b32_e32 v12, s13
-; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s6
-; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s10
-; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40008
-; GFX8-NEXT: v_mov_b32_e32 v5, s15
-; GFX8-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX8-NEXT: v_mul_i32_i24_e32 v3, s10, v3
+; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40010
; GFX8-NEXT: v_and_b32_e32 v4, s2, v4
-; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX8-NEXT: s_lshr_b32 s5, s0, 20
-; GFX8-NEXT: s_lshr_b32 s9, s1, 20
-; GFX8-NEXT: v_mul_i32_i24_e32 v5, s14, v5
-; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5
-; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s9
-; GFX8-NEXT: s_bfe_i32 s17, s1, 0x40010
-; GFX8-NEXT: v_and_b32_e32 v6, s2, v6
-; GFX8-NEXT: v_and_b32_e32 v7, s2, v7
-; GFX8-NEXT: s_lshr_b32 s8, s1, 28
-; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX8-NEXT: s_lshr_b32 s4, s0, 28
-; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v13, s17
-; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s4
-; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s8
-; GFX8-NEXT: s_bfe_i32 s1, s1, 0x40018
-; GFX8-NEXT: v_and_b32_e32 v8, s2, v8
-; GFX8-NEXT: v_and_b32_e32 v9, s2, v9
-; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX8-NEXT: s_bfe_i32 s0, s0, 0x40018
-; GFX8-NEXT: v_and_b32_e32 v10, s2, v10
-; GFX8-NEXT: v_and_b32_e32 v11, s2, v11
+; GFX8-NEXT: v_and_b32_e32 v5, s2, v5
+; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40014
+; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40010
+; GFX8-NEXT: v_mov_b32_e32 v8, s13
+; GFX8-NEXT: s_bfe_i32 s17, s1, 0x40018
+; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40014
+; GFX8-NEXT: v_mov_b32_e32 v9, s15
+; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40018
+; GFX8-NEXT: s_ashr_i32 s1, s1, 28
+; GFX8-NEXT: v_mov_b32_e32 v10, s17
+; GFX8-NEXT: s_ashr_i32 s0, s0, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v2, s12, v12, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2
-; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s16, v13, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, v8, v9, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s6, v6, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2
+; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s12, v8, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s14, v9, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s16, v10, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, v10, v11, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s7, s0, 4
-; GFX9-NEXT: s_lshr_b32 s11, s1, 4
-; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s7
-; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s11
-; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40000
-; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3
+; GFX9-NEXT: s_lshr_b32 s4, s0, 12
+; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40000
+; GFX9-NEXT: s_lshr_b32 s5, s1, 12
+; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40004
+; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40008
+; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40000
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s4
+; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s5
+; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004
+; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40008
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
+; GFX9-NEXT: v_mov_b32_e32 v7, s9
; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-NEXT: s_lshr_b32 s6, s0, 12
-; GFX9-NEXT: s_lshr_b32 s10, s1, 12
-; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40008
-; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40000
-; GFX9-NEXT: v_mov_b32_e32 v12, s13
-; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s6
-; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s10
-; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v5, s15
-; GFX9-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX9-NEXT: v_mul_i32_i24_e32 v3, s10, v3
+; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010
; GFX9-NEXT: v_and_b32_e32 v4, s2, v4
-; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-NEXT: s_lshr_b32 s5, s0, 20
-; GFX9-NEXT: s_lshr_b32 s9, s1, 20
-; GFX9-NEXT: v_mul_i32_i24_e32 v5, s14, v5
-; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s5
-; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s9
-; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40010
-; GFX9-NEXT: v_and_b32_e32 v6, s2, v6
-; GFX9-NEXT: v_and_b32_e32 v7, s2, v7
-; GFX9-NEXT: s_lshr_b32 s8, s1, 28
-; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX9-NEXT: s_lshr_b32 s4, s0, 28
-; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v13, s17
-; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4
-; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s8
-; GFX9-NEXT: s_bfe_i32 s1, s1, 0x40018
-; GFX9-NEXT: v_and_b32_e32 v8, s2, v8
-; GFX9-NEXT: v_and_b32_e32 v9, s2, v9
-; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-NEXT: s_bfe_i32 s0, s0, 0x40018
-; GFX9-NEXT: v_and_b32_e32 v10, s2, v10
-; GFX9-NEXT: v_and_b32_e32 v11, s2, v11
+; GFX9-NEXT: v_and_b32_e32 v5, s2, v5
+; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014
+; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010
+; GFX9-NEXT: v_mov_b32_e32 v8, s13
+; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018
+; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014
+; GFX9-NEXT: v_mov_b32_e32 v9, s15
+; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018
+; GFX9-NEXT: s_ashr_i32 s1, s1, 28
+; GFX9-NEXT: v_mov_b32_e32 v10, s17
+; GFX9-NEXT: s_ashr_i32 s0, s0, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_i32_i24 v2, s12, v12, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, v3, v4, v2
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NEXT: v_mad_u32_u24 v2, v6, v7, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s16, v13, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, v8, v9, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s6, v6, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2
+; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s12, v8, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s14, v9, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s16, v10, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, v10, v11, v2
; GFX9-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 4
-; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 4
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s7
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s11
-; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40000
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3
+; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 12
+; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40000
+; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 12
+; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40004
+; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40008
+; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s5
+; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
+; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40008
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 12
-; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 12
-; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40008
-; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40000
-; GFX9-DL-NEXT: v_mov_b32_e32 v12, s13
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s6
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s10
-; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15
-; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s10, v3
+; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40010
; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 20
-; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 20
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v5, s14, v5
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s5
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s9
-; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40010
-; GFX9-DL-NEXT: v_and_b32_e32 v6, s2, v6
-; GFX9-DL-NEXT: v_and_b32_e32 v7, s2, v7
-; GFX9-DL-NEXT: s_lshr_b32 s8, s1, 28
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28
-; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v13, s17
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s8
-; GFX9-DL-NEXT: s_bfe_i32 s1, s1, 0x40018
-; GFX9-DL-NEXT: v_and_b32_e32 v8, s2, v8
-; GFX9-DL-NEXT: v_and_b32_e32 v9, s2, v9
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-DL-NEXT: s_bfe_i32 s0, s0, 0x40018
-; GFX9-DL-NEXT: v_and_b32_e32 v10, s2, v10
-; GFX9-DL-NEXT: v_and_b32_e32 v11, s2, v11
+; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5
+; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40014
+; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s13
+; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40018
+; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s15
+; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018
+; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28
+; GFX9-DL-NEXT: v_mov_b32_e32 v10, s17
+; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v12, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v6, v7, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v13, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v8, v9, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v6, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2
+; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v8, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v9, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v10, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v10, v11, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40008
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX8-NEXT: v_mul_u32_u24_e32 v4, s7, v4
-; GFX8-NEXT: s_bfe_u32 s5, s4, 0x4000c
-; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX8-NEXT: s_bfe_u32 s7, s4, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40014
-; GFX8-NEXT: v_mov_b32_e32 v7, s7
-; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40010
-; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40018
-; GFX8-NEXT: v_mov_b32_e32 v8, s8
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40014
-; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40018
+; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008
+; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX8-NEXT: v_mov_b32_e32 v7, s9
+; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX8-NEXT: s_lshr_b32 s4, s4, 28
-; GFX8-NEXT: v_mov_b32_e32 v9, s9
+; GFX8-NEXT: v_mov_b32_e32 v9, s13
; GFX8-NEXT: s_lshr_b32 s2, s2, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v8, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s8, v9, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v4, s6
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX9-NEXT: v_mul_u32_u24_e32 v4, s7, v4
-; GFX9-NEXT: s_bfe_u32 s5, s4, 0x4000c
-; GFX9-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010
-; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40018
-; GFX9-NEXT: v_mov_b32_e32 v8, s8
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40014
-; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40018
+; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008
+; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX9-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX9-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX9-NEXT: s_lshr_b32 s4, s4, 28
-; GFX9-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-NEXT: v_mov_b32_e32 v9, s13
; GFX9-NEXT: s_lshr_b32 s2, s2, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s1, v5, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s8, v9, v2
+; GFX9-NEXT: v_add_u32_e32 v2, v2, v5
+; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5
; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, s7, v4
-; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x4000c
-; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40018
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v5, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v9, v2
+; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008
+; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008
-; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v6, s7
-; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40014
-; GFX8-NEXT: v_mov_b32_e32 v7, s8
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010
-; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40018
-; GFX8-NEXT: v_mov_b32_e32 v8, s9
-; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014
-; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40018
+; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX8-NEXT: v_mov_b32_e32 v7, s9
+; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX8-NEXT: s_lshr_b32 s4, s4, 28
-; GFX8-NEXT: v_mov_b32_e32 v9, s10
+; GFX8-NEXT: v_mov_b32_e32 v9, s13
; GFX8-NEXT: s_lshr_b32 s2, s2, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v5, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s8, v8, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s9, v9, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
+; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008
+; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c
; GFX9-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008
-; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v6, s7
-; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v7, s8
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010
-; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40018
-; GFX9-NEXT: v_mov_b32_e32 v8, s9
-; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014
-; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40018
+; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX9-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX9-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX9-NEXT: s_lshr_b32 s4, s4, 28
-; GFX9-NEXT: v_mov_b32_e32 v9, s10
+; GFX9-NEXT: v_mov_b32_e32 v9, s13
; GFX9-NEXT: s_lshr_b32 s2, s2, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v5, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s8, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s9, v9, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c
; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5
; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7
-; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x40018
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9
-; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v9, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2
+; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40008
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX8-NEXT: v_mul_u32_u24_e32 v4, s7, v4
-; GFX8-NEXT: s_bfe_u32 s5, s4, 0x4000c
-; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX8-NEXT: s_bfe_u32 s7, s4, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40014
-; GFX8-NEXT: v_mov_b32_e32 v7, s7
-; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40010
-; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40018
-; GFX8-NEXT: v_mov_b32_e32 v8, s8
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40014
-; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40018
+; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008
+; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX8-NEXT: v_mov_b32_e32 v7, s9
+; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX8-NEXT: s_lshr_b32 s4, s4, 28
-; GFX8-NEXT: v_mov_b32_e32 v9, s9
+; GFX8-NEXT: v_mov_b32_e32 v9, s13
; GFX8-NEXT: s_lshr_b32 s2, s2, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v8, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s8, v9, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v4, s6
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX9-NEXT: v_mul_u32_u24_e32 v4, s7, v4
-; GFX9-NEXT: s_bfe_u32 s5, s4, 0x4000c
-; GFX9-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010
-; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40018
-; GFX9-NEXT: v_mov_b32_e32 v8, s8
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40014
-; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40018
+; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008
+; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX9-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX9-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX9-NEXT: s_lshr_b32 s4, s4, 28
-; GFX9-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-NEXT: v_mov_b32_e32 v9, s13
; GFX9-NEXT: s_lshr_b32 s2, s2, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s1, v5, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s8, v9, v2
+; GFX9-NEXT: v_add_u32_e32 v2, v2, v5
+; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5
; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, s7, v4
-; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x4000c
-; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40018
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v5, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v9, v2
+; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC --check-prefix=GCN
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN
-; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN1
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2
+; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
; EG: 16
; FIXME: Should be using scalar instructions here.
-; GCN: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; GCN: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16
+; GCN1: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; GCN1: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16
+; GCN2: s_mul_i32 [[MUL:s[0-9]]], {{[s][0-9], [s][0-9]}}
+; GCN2: s_add_i32 [[MAD:s[0-9]]], [[MUL]], s{{[0-9]}}
+; GCN2: s_sext_i32_i16 s0, [[MAD]]
+; GCN2: v_mov_b32_e32 v0, s0
define amdgpu_kernel void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
entry:
%0 = mul i16 %a, %b
; The result must be sign-extended
; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
; EG: 8
-; GCN: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
+; GCN1: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; GCN1: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
+; GCN2: s_mul_i32 [[MUL:s[0-9]]], {{[s][0-9], [s][0-9]}}
+; GCN2: s_add_i32 [[MAD:s[0-9]]], [[MUL]], s{{[0-9]}}
+; GCN2: s_sext_i32_i8 s0, [[MAD]]
+; GCN2: v_mov_b32_e32 v0, s0
define amdgpu_kernel void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
entry:
%0 = mul i8 %a, %b
}
; GCN-LABEL: {{^}}and_not_mask_i64:
-; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
+; GCN-DAG: buffer_load_dword v[[VAL:[0-9]+]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[SHRHI:[0-9]+]], v[[ZERO]]{{$}}
-; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]]
+; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VAL]]
; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
; GCN-NOT: v[[SHRLO]]
; GCN-NOT: v[[SHRHI]]
; after 64-bit shift is split.
; GCN-LABEL: {{^}}lshr_and_i64_35:
-; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[HI]], 8, 23
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_load_dword v[[LO:[0-9]+]]
+; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[LO]], 8, 23
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
ret void
}
+; FIXME: This 'or' should fold into an offset on the write.
; GCN-LABEL: {{^}}shl_or_ptr_combine_2use_lds:
; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
-; GCN: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:32
+; GCN: v_or_b32_e32 [[SCALE1:v[0-9]+]], 32, [[SCALE0]]
+; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}}
; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} offset:64
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 4
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 4
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 1
; VI-NEXT: v_mov_b32_e32 v2, s0
}
; CHECK-LABEL: unsafe_add_underflow:
-; CHECK: subs r0, #2
-; CHECK: uxtb [[EXT:r[0-9]+]], r0
-; CHECK: cmp [[EXT]], #255
-; CHECK: moveq r0, #8
+; CHECK: movs r1, #16
+; CHECK: cmp r0, #1
+; CHECK: it eq
+; CHECK: moveq r1, #8
+; CHECK: mov r0, r1
define i32 @unsafe_add_underflow(i8 zeroext %a) {
%add = add i8 %a, -2
%cmp = icmp ugt i8 %add, 254
; CHECK-LABEL: check_f32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vdup.32 d16, d17[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
; CHECK-LABEL: check_i32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vdup.32 d16, d17[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
; CHECK-LABEL: check_i16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vdup.16 d16, d16[3]
; CHECK-NEXT: vmov r0, r1, d16
define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
; CHECK-LABEL: check_i8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vdup.8 d16, d16[3]
; CHECK-NEXT: vmov r0, r1, d16
define void @pr39478(i64* %p64, i32* %p32) {
; CHECKLE-LABEL: pr39478:
; CHECKLE: # %bb.0: # %entry
-; CHECKLE-NEXT: lwz 3, 4(3)
+; CHECKLE-NEXT: lbz 3, 4(3)
; CHECKLE-NEXT: stb 3, 0(4)
; CHECKLE-NEXT: blr
;
; CHECKBE-LABEL: pr39478:
; CHECKBE: # %bb.0: # %entry
-; CHECKBE-NEXT: lwz 3, 0(3)
+; CHECKBE-NEXT: lbz 3, 3(3)
; CHECKBE-NEXT: stb 3, 3(4)
; CHECKBE-NEXT: blr
entry:
; CHECK-NEXT: blr
; CHECK-BE-LABEL: test_igesll_sext_z:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: sradi r3, r3, 63
; CHECK-BE-NEXT: not r3, r3
+; CHECK-BE-NEXT: sradi r3, r3, 63
; CHECK-BE-NEXT: blr
;
; CHECK-LE-LABEL: test_igesll_sext_z:
; CHECK-LE: # %bb.0: # %entry
-; CHECK-LE-NEXT: sradi r3, r3, 63
; CHECK-LE-NEXT: not r3, r3
+; CHECK-LE-NEXT: sradi r3, r3, 63
; CHECK-LE-NEXT: blr
entry:
%cmp = icmp sgt i64 %a, -1
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
-define void @PR22524({ float, float }* %arg) {
-; Check that we can materialize the zero constants we store in two places here,
-; and at least form a legal store of the floating point value at the end.
-; The DAG combiner at one point contained bugs that given enough permutations
-; would incorrectly form an illegal operation for the last of these stores when
-; it folded it to a zero too late to legalize the zero store operation. If this
-; ever starts forming a zero store instead of movss, the test case has stopped
-; being useful.
-;
-; CHECK-LABEL: PR22524:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: xorps %xmm1, %xmm1
-; CHECK-NEXT: mulss %xmm0, %xmm1
-; CHECK-NEXT: movl $0, (%rdi)
-; CHECK-NEXT: movss %xmm1, 4(%rdi)
-; CHECK-NEXT: retq
-entry:
- %0 = getelementptr inbounds { float, float }, { float, float }* %arg, i32 0, i32 1
- store float 0.000000e+00, float* %0, align 4
- %1 = getelementptr inbounds { float, float }, { float, float }* %arg, i64 0, i32 0
- %2 = bitcast float* %1 to i64*
- %3 = load i64, i64* %2, align 8
- %4 = trunc i64 %3 to i32
- %5 = lshr i64 %3, 32
- %6 = trunc i64 %5 to i32
- %7 = bitcast i32 %6 to float
- %8 = fmul float %7, 0.000000e+00
- %9 = bitcast float* %1 to i32*
- store i32 %6, i32* %9, align 4
- store float %8, float* %0, align 4
- ret void
-}
-
-
define void @bitstore_fold() {
; CHECK-LABEL: bitstore_fold:
; CHECK: # %bb.0: # %BB
; CHECK-LABEL: extsetcc:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpnleps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vextractps $0, %xmm0, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: movb %al, (%rax)
+; CHECK-NEXT: vucomiss %xmm1, %xmm0
+; CHECK-NEXT: setb (%rax)
; CHECK-NEXT: retq
%cmp = fcmp ult <4 x float> %x, zeroinitializer
%sext = sext <4 x i1> %cmp to <4 x i32>
; CHECK-NEXT: movzwl (%eax), %eax
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: imull $52429, %eax, %ecx # imm = 0xCCCD
-; CHECK-NEXT: shrl $19, %ecx
-; CHECK-NEXT: addl %ecx, %ecx
+; CHECK-NEXT: shrl $18, %ecx
+; CHECK-NEXT: andl $-2, %ecx
; CHECK-NEXT: leal (%ecx,%ecx,4), %ecx
; CHECK-NEXT: cmpw %cx, %ax
; CHECK-NEXT: jne .LBB12_5
; CHECK-NEXT: subl $16, %esp
; CHECK-NEXT: movl $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: orl $0, %eax
-; CHECK-NEXT: je .LBB5_3
+; CHECK-NEXT: movb $1, %al
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jne .LBB5_3
; CHECK-NEXT: # %bb.1: # %if.then
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: jmp .LBB5_2
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpslld $24, %ymm0, %ymm0
; SKX_32-NEXT: vpsrad $24, %ymm0, %ymm1
-; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
; SKX_32-NEXT: retl
}
; PR11570
-; FIXME: This should also use movmskps; we don't form the FGETSIGN node
-; in this case, though.
define void @float_call_signbit(double %n) {
; CHECK-LABEL: float_call_signbit:
; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: movq %xmm0, %rdi
-; CHECK-NEXT: shrq $63, %rdi
-; CHECK-NEXT: ## kill: def $edi killed $edi killed $rdi
+; CHECK-NEXT: movmskpd %xmm0, %edi
+; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: jmp _float_call_signbit_callee ## TAILCALL
entry:
%t0 = bitcast double %n to i64
define <4 x i32> @shrink_xor_constant1_splat(<4 x i32> %x) {
; ALL-LABEL: shrink_xor_constant1_splat:
; ALL: # %bb.0:
+; ALL-NEXT: pcmpeqd %xmm1, %xmm1
+; ALL-NEXT: pxor %xmm1, %xmm0
; ALL-NEXT: psrld $31, %xmm0
-; ALL-NEXT: pxor {{.*}}(%rip), %xmm0
; ALL-NEXT: retq
%sh = lshr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
%not = xor <4 x i32> %sh, <i32 -1, i32 -1, i32 -1, i32 -1>
define void @PR41097() {
; SSE2-LABEL: PR41097:
; SSE2: # %bb.0:
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
+++ /dev/null
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
-
-; The test case is rather involved, because we need to get to a state where
-; We have a sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0) combine,
-; BUT this combine is only triggered post-legalization, so the setcc's return
-; type is i8. So we can't have the combine opportunity be exposed too early.
-; Basically, what we want to see is that the compare result zero-extended, and
-; then stored. Only one zext, and no sexts.
-
-; CHECK-LABEL: main:
-; CHECK: movzbl (%rdi), %[[EAX:.*]]
-; CHECK-NEXT: xorl %e[[C:.]]x, %e[[C]]x
-; CHECK-NEXT: cmpl $1, %[[EAX]]
-; CHECK-NEXT: sete %[[C]]l
-; CHECK-NEXT: movl %e[[C]]x, (%rsi)
-define void @main(i8* %p, i32* %q) {
-bb:
- %tmp4 = load i8, i8* %p, align 1
- %tmp5 = sext i8 %tmp4 to i32
- %tmp6 = load i8, i8* %p, align 1
- %tmp7 = zext i8 %tmp6 to i32
- %tmp8 = sub nsw i32 %tmp5, %tmp7
- %tmp11 = icmp eq i32 %tmp7, 1
- %tmp12 = zext i1 %tmp11 to i32
- %tmp13 = add nsw i32 %tmp8, %tmp12
- %tmp14 = trunc i32 %tmp13 to i8
- %tmp15 = sext i8 %tmp14 to i16
- %tmp16 = sext i16 %tmp15 to i32
- store i32 %tmp16, i32* %q, align 4
- br i1 %tmp11, label %bb21, label %bb22
-
-bb21: ; preds = %bb
- unreachable
-
-bb22: ; preds = %bb
- ret void
-}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -o - %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@global = external global i32
-@global.1 = external global i64
-
-define void @patatino() {
-; CHECK-LABEL: patatino:
-; CHECK: # %bb.0: # %bb
-; CHECK-NEXT: movl {{.*}}(%rip), %eax
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl $31, %ecx
-; CHECK-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-NEXT: shrl $31, %ecx
-; CHECK-NEXT: andl $-2, %ecx
-; CHECK-NEXT: andl $-536870912, %eax # imm = 0xE0000000
-; CHECK-NEXT: orl %ecx, %eax
-; CHECK-NEXT: movl %eax, {{.*}}(%rip)
-; CHECK-NEXT: retq
-bb:
- %tmp = load i32, i32* @global
- %tmp1 = lshr i32 %tmp, 31
- %tmp2 = add nuw nsw i32 %tmp1, 2147483647
- %tmp3 = load i64, i64* @global.1
- %tmp4 = shl i64 %tmp3, 23
- %tmp5 = add nsw i64 %tmp4, 8388639
- %tmp6 = trunc i64 %tmp5 to i32
- %tmp7 = lshr i32 %tmp2, %tmp6
- %tmp8 = load i32, i32* @global
- %tmp9 = and i32 %tmp7, 62
- %tmp10 = and i32 %tmp8, -536870912
- %tmp11 = or i32 %tmp9, %tmp10
- store i32 %tmp11, i32* @global
- ret void
-}
define <4 x i32> @t17() nounwind {
; X86-LABEL: t17:
; X86: # %bb.0: # %entry
-; X86-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
-; X86-NEXT: andpd {{\.LCPI.*}}, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
+; X86-NEXT: pand {{\.LCPI.*}}, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: t17:
; X64: # %bb.0: # %entry
-; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
-; X64-NEXT: andpd {{.*}}(%rip), %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
+; X64-NEXT: pand {{.*}}(%rip), %xmm0
; X64-NEXT: retq
entry:
%tmp1 = load <4 x float>, <4 x float>* undef, align 16
define i32 @test0(<1 x i64>* %v4) nounwind {
; X32-LABEL: test0:
; X32: # %bb.0: # %entry
-; X32-NEXT: pushl %ebp
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: andl $-8, %esp
-; X32-NEXT: subl $8, %esp
-; X32-NEXT: movl 8(%ebp), %eax
-; X32-NEXT: movl (%eax), %ecx
-; X32-NEXT: movl 4(%eax), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X32-NEXT: movl %ecx, (%esp)
-; X32-NEXT: pshufw $238, (%esp), %mm0 # mm0 = mem[2,3,2,3]
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pshufw $238, (%eax), %mm0 # mm0 = mem[2,3,2,3]
; X32-NEXT: movd %mm0, %eax
; X32-NEXT: addl $32, %eax
-; X32-NEXT: movl %ebp, %esp
-; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; X64-LABEL: test0:
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,4]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-SLOW-NEXT: retq
;
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,5,6]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,5,6]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-SLOW-NEXT: retq
;
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,4]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-SLOW-NEXT: retq
;
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,4,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,4,4]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-SLOW-NEXT: retq
;
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-SLOW-NEXT: retq
;
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-SLOW-NEXT: retq
;
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-SLOW-NEXT: retq
;
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,0,0,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,0,0,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-SLOW-NEXT: retq
;
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: PR34369:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7],ymm0[8,9,10],ymm2[11],ymm0[12,13,14],ymm2[15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: PR34369:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,4,5,10,11,8,9,10,11,4,5,4,5]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7],ymm0[8,9,10],ymm2[11],ymm0[12,13,14],ymm2[15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: PR34369:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,4,5,10,11,8,9,10,11,4,5,4,5]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7],ymm0[8,9,10],ymm2[11],ymm0[12,13,14],ymm2[15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
;
; AVX512VL-LABEL: PR34369:
; AVX512VL: # %bb.0:
; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: andl $-8, %esp
; CHECK-NEXT: subl $32, %esp
-; CHECK-NEXT: movl {{\.LCPI.*}}, %eax
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: movw $1, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl $65537, {{[0-9]+}}(%esp) # imm = 0x10001
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
; CHECK-NEXT: jmp .LBB0_1
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %forbody