setCondCodeAction(ISD::SETUNE, VT, Expand);
}
- // Mark integer truncating stores as having custom lowering
+ // Mark integer truncating stores/extending loads as having custom lowering
if (VT.isInteger()) {
MVT InnerVT = VT.changeVectorElementType(MVT::i8);
while (InnerVT != VT) {
}
}
+ // Mark floating-point truncating stores/extending loads as having custom
+ // lowering
+ if (VT.isFloatingPoint()) {
+ MVT InnerVT = VT.changeVectorElementType(MVT::f16);
+ while (InnerVT != VT) {
+ setTruncStoreAction(VT, InnerVT, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
+ InnerVT = InnerVT.changeVectorElementType(
+ MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
+ }
+ }
+
// Lower fixed length vector operations to scalable equivalents.
setOperationAction(ISD::ABS, VT, Custom);
setOperationAction(ISD::ADD, VT, Custom);
SDLoc DL(Op);
EVT VT = Op.getValueType();
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT LoadVT = ContainerVT;
+ EVT MemVT = Load->getMemoryVT();
+
+ auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
+
+ if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
+ LoadVT = ContainerVT.changeTypeToInteger();
+ MemVT = MemVT.changeTypeToInteger();
+ }
auto NewLoad = DAG.getMaskedLoad(
- ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
- getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
- Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
- Load->getExtensionType());
+ LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
+ DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
+ Load->getAddressingMode(), Load->getExtensionType());
+
+ if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
+ EVT ExtendVT = ContainerVT.changeVectorElementType(
+ Load->getMemoryVT().getVectorElementType());
+
+ NewLoad = getSVESafeBitCast(ExtendVT, NewLoad, DAG);
+ NewLoad = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
+ Pg, NewLoad, DAG.getUNDEF(ContainerVT));
+ }
auto Result = convertFromScalableVector(DAG, VT, NewLoad);
SDValue MergedValues[2] = {Result, Load->getChain()};
SDLoc DL(Op);
EVT VT = Store->getValue().getValueType();
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT MemVT = Store->getMemoryVT();
+ auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
- return DAG.getMaskedStore(
- Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
- getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
- Store->getMemOperand(), Store->getAddressingMode(),
- Store->isTruncatingStore());
+
+ if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
+ EVT TruncVT = ContainerVT.changeVectorElementType(
+ Store->getMemoryVT().getVectorElementType());
+ MemVT = MemVT.changeTypeToInteger();
+ NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
+ NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
+ DAG.getUNDEF(TruncVT));
+ NewValue =
+ getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
+ }
+
+ return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
+ Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
+ Store->getMemOperand(), Store->getAddressingMode(),
+ Store->isTruncatingStore());
}
SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {
; CHECK-LABEL: fcvt_v8f16_v8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ptrue p0.s, vl8
-; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.s, p0/m, z0.h
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_EQ_256-NEXT: ret
-
+;
; VBITS_GE_512-LABEL: fcvt_v16f16_v16f32:
; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.h, vl16
-; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
+
%op1 = load <16 x half>, <16 x half>* %a
%res = fpext <16 x half> %op1 to <16 x float>
store <16 x float> %res, <16 x float>* %b
define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 {
; VBITS_GE_1024-LABEL: fcvt_v32f16_v32f32:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 {
; VBITS_GE_2048-LABEL: fcvt_v64f16_v64f32:
; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: fcvt_v4f16_v4f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ptrue p0.d, vl4
-; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.d, p0/m, z0.h
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
}
define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
-; Ensure sensible type legalisation.
; VBITS_EQ_256-LABEL: fcvt_v8f16_v8f64:
; VBITS_EQ_256: // %bb.0:
; VBITS_EQ_256-NEXT: ldr q0, [x0]
; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.h
; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: ret
-
+;
; VBITS_GE_512-LABEL: fcvt_v8f16_v8f64:
; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ldr q0, [x0]
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: ld1sh { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
+
%op1 = load <8 x half>, <8 x half>* %a
%res = fpext <8 x half> %op1 to <8 x double>
store <8 x double> %res, <8 x double>* %b
define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 {
; VBITS_GE_1024-LABEL: fcvt_v16f16_v16f64:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: ld1sh { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 {
; VBITS_GE_2048-LABEL: fcvt_v32f16_v32f64:
; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: fcvt_v4f32_v4f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ptrue p0.d, vl4
-; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.d, p0/m, z0.s
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: ret
-
+;
; VBITS_GE_512-LABEL: fcvt_v8f32_v8f64:
; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl8
-; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
+
%op1 = load <8 x float>, <8 x float>* %a
%res = fpext <8 x float> %op1 to <8 x double>
store <8 x double> %res, <8 x double>* %b
define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 {
; VBITS_GE_1024-LABEL: fcvt_v16f32_v16f64:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: ld1sw { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 {
; VBITS_GE_2048-LABEL: fcvt_v32f32_v32f64:
; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: ptrue p0.s
; VBITS_GE_512-NEXT: fcvt z0.h, p0/m, z0.s
-; VBITS_GE_512-NEXT: ptrue p0.h, vl16
-; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%res = fptrunc <16 x float> %op1 to <16 x half>
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.s
; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.s
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_1024-NEXT: st1h { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%res = fptrunc <32 x float> %op1 to <32 x half>
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.s
; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.s
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_2048-NEXT: st1h { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%res = fptrunc <64 x float> %op1 to <64 x half>
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d
; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.d
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_1024-NEXT: st1h { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptrunc <16 x double> %op1 to <16 x half>
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.d
; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.d
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptrunc <32 x double> %op1 to <32 x half>
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: ptrue p0.d
; VBITS_GE_512-NEXT: fcvt z0.s, p0/m, z0.d
-; VBITS_GE_512-NEXT: ptrue p0.s, vl8
-; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%res = fptrunc <8 x double> %op1 to <8 x float>
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d
; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.d
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptrunc <16 x double> %op1 to <16 x float>
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.d
; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.d
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptrunc <32 x double> %op1 to <32 x float>