TargetLowering::Legal)
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
N0.getOperand(0), DAG.getValueType(ExtVT));
+
// Even if we can't convert to sext_inreg, we might be able to remove
// this shift pair if the input is already sign extended.
if (DAG.ComputeNumSignBits(N0.getOperand(0)) > N1C->getZExtValue())
return N0.getOperand(0);
}
+ // fold (sra (or (shl x, c1), (shl y, c2)), c1)
+ // -> (sext_inreg (or x, (shl y, c2-c1)))
+ // iff c2 >= c1 and the target supports sext_inreg.
+ if (N1C && N0.getOpcode() == ISD::OR &&
+ N0.getOperand(0).getOpcode() == ISD::SHL &&
+ N0.getOperand(1).getOpcode() == ISD::SHL &&
+ (N1 == N0.getOperand(0).getOperand(1) ||
+ N1 == N0.getOperand(1).getOperand(1)) &&
+ N0->hasOneUse() && N0.getOperand(0)->hasOneUse() &&
+ N0.getOperand(1)->hasOneUse()) {
+ ConstantSDNode *N00C = isConstOrConstSplat(N0.getOperand(0).getOperand(1));
+ ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1).getOperand(1));
+ if (N00C && N01C &&
+ N00C->getAPIntValue().uge(N1C->getZExtValue()) &&
+ N01C->getAPIntValue().uge(N1C->getZExtValue())) {
+ unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
+ EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
+ if (VT.isVector())
+ ExtVT = EVT::getVectorVT(*DAG.getContext(), ExtVT,
+ VT.getVectorElementCount());
+ if (!LegalOperations ||
+ TLI.getOperationAction(ISD::SIGN_EXTEND_INREG, ExtVT) ==
+ TargetLowering::Legal) {
+ // Apply SRL on top of the SHL nodes; SimplifyDemandedBits will clean
+ // this up. It looks messy, but it's a lot simpler than handling all the
+ // possible shift value type mismatches we could have.
+ SDLoc DL(N);
+ SDValue LHS = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
+ SDValue RHS = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(1), N1);
+ SDValue Or = DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Or,
+ DAG.getValueType(ExtVT));
+ }
+ }
+ }
+
// fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
// clamp (add c1, c2) to max shift.
if (N0.getOpcode() == ISD::SRA) {
ret void
}
-; TODO ashr(or(shl(x,c1),shl(y,c2)),c1) -> sign_extend_inreg(or(x,shl(y,c2-c1))) iff c2 >= c1
+; ashr(or(shl(x,c1),shl(y,c2)),c1) -> sign_extend_inreg(or(x,shl(y,c2-c1))) iff c2 >= c1
define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x, i32 addrspace(1)* %y) {
; SI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s2, s[6:7], 0x0
-; SI-NEXT: s_load_dword s0, s[0:1], 0x0
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dword s4, s[4:5], 0x0
+; SI-NEXT: s_load_dword s2, s[2:3], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_lshl_b32 s1, s2, 17
-; SI-NEXT: s_lshl_b32 s0, s0, 19
-; SI-NEXT: s_or_b32 s0, s1, s0
-; SI-NEXT: s_ashr_i32 s0, s0, 17
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_lshl_b32 s4, s4, 2
+; SI-NEXT: s_or_b32 s2, s2, s4
+; SI-NEXT: s_bfe_i32 s4, s2, 0xf0000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_load_dword s4, s[4:5], 0x0
+; VI-NEXT: s_load_dword s2, s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s1, s2, 17
-; VI-NEXT: s_lshl_b32 s0, s0, 19
-; VI-NEXT: s_or_b32 s0, s1, s0
-; VI-NEXT: s_ashr_i32 s0, s0, 17
+; VI-NEXT: s_lshl_b32 s0, s4, 2
+; VI-NEXT: s_or_b32 s0, s2, s0
+; VI-NEXT: s_bfe_i32 s0, s0, 0xf0000
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm