EVT PromotedType = Op1Promoted.getValueType();
unsigned NewBits = PromotedType.getScalarSizeInBits();
+ if (Opcode == ISD::UADDSAT) {
+ APInt MaxVal = APInt::getAllOnesValue(OldBits).zext(NewBits);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
+ SDValue Add =
+ DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted);
+ return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax);
+ }
+
// USUBSAT can always be promoted as long as we have zero-extended the args.
if (Opcode == ISD::USUBSAT)
return DAG.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted,
case ISD::SSHLSAT:
ShiftOp = ISD::SRA;
break;
- case ISD::UADDSAT:
case ISD::USHLSAT:
ShiftOp = ISD::SRL;
break;
return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
}
- if (Opcode == ISD::UADDSAT) {
- APInt MaxVal = APInt::getAllOnesValue(OldBits).zext(NewBits);
- SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
- SDValue Add =
- DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted);
- return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax);
- }
-
unsigned AddOp = Opcode == ISD::SADDSAT ? ISD::ADD : ISD::SUB;
APInt MinVal = APInt::getSignedMinValue(OldBits).sext(NewBits);
APInt MaxVal = APInt::getSignedMaxValue(OldBits).sext(NewBits);
; CHECK-NEXT: ldrb w9, [x1]
; CHECK-NEXT: ldrb w10, [x0, #1]
; CHECK-NEXT: ldrb w11, [x1, #1]
+; CHECK-NEXT: ldrb w12, [x0, #2]
; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldrb w8, [x1, #2]
; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: ldrb w9, [x1, #2]
; CHECK-NEXT: mov v0.h[1], w10
+; CHECK-NEXT: ldrb w9, [x0, #3]
+; CHECK-NEXT: ldrb w10, [x1, #3]
; CHECK-NEXT: mov v1.h[1], w11
-; CHECK-NEXT: ldrb w10, [x0, #3]
-; CHECK-NEXT: ldrb w11, [x1, #3]
-; CHECK-NEXT: mov v0.h[2], w8
-; CHECK-NEXT: mov v1.h[2], w9
-; CHECK-NEXT: mov v0.h[3], w10
-; CHECK-NEXT: mov v1.h[3], w11
-; CHECK-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-NEXT: uqadd v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ushr v0.4h, v0.4h, #8
+; CHECK-NEXT: mov v0.h[2], w12
+; CHECK-NEXT: mov v1.h[2], w8
+; CHECK-NEXT: mov v0.h[3], w9
+; CHECK-NEXT: mov v1.h[3], w10
+; CHECK-NEXT: movi d2, #0xff00ff00ff00ff
+; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: umin v0.4h, v0.4h, v2.4h
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: str s0, [x2]
; CHECK-NEXT: ret
; CHECK-NEXT: ldrb w10, [x0, #1]
; CHECK-NEXT: ldrb w11, [x1, #1]
; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: mov v0.s[1], w10
-; CHECK-NEXT: mov v1.s[1], w11
-; CHECK-NEXT: shl v1.2s, v1.2s, #24
-; CHECK-NEXT: shl v0.2s, v0.2s, #24
-; CHECK-NEXT: uqadd v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ushr v0.2s, v0.2s, #24
+; CHECK-NEXT: mov v2.s[1], w11
+; CHECK-NEXT: movi d1, #0x0000ff000000ff
+; CHECK-NEXT: add v0.2s, v0.2s, v2.2s
+; CHECK-NEXT: umin v0.2s, v0.2s, v1.2s
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: strb w9, [x2]
; CHECK-NEXT: ldrh w10, [x0, #2]
; CHECK-NEXT: ldrh w11, [x1, #2]
; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: mov v0.s[1], w10
-; CHECK-NEXT: mov v1.s[1], w11
-; CHECK-NEXT: shl v1.2s, v1.2s, #16
-; CHECK-NEXT: shl v0.2s, v0.2s, #16
-; CHECK-NEXT: uqadd v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ushr v0.2s, v0.2s, #16
+; CHECK-NEXT: mov v2.s[1], w11
+; CHECK-NEXT: movi d1, #0x00ffff0000ffff
+; CHECK-NEXT: add v0.2s, v0.2s, v2.2s
+; CHECK-NEXT: umin v0.2s, v0.2s, v1.2s
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: strh w9, [x2]
; CHECK-LABEL: v16i4:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v2.16b, #15
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: shl v1.16b, v1.16b, #4
-; CHECK-NEXT: shl v0.16b, v0.16b, #4
-; CHECK-NEXT: uqadd v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ushr v0.16b, v0.16b, #4
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: add v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: umin v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
ret <16 x i4> %z
; GFX8-LABEL: v_uaddsat_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp
-; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
+; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-NEXT: v_min_u16_e32 v0, 0xff, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uaddsat_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp
-; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NEXT: v_min_u16_e32 v0, 0xff, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; SSE-LABEL: v16i4:
; SSE: # %bb.0:
-; SSE-NEXT: psllw $4, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: psllw $4, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: paddusb %xmm1, %xmm0
-; SSE-NEXT: psrlw $4, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: pminub %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v16i4:
; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
ret <16 x i4> %z