[AArch64][SVE] Fix fptrunc store for fixed len vector

author Peter Waller <peter.waller@arm.com>

Mon, 6 Dec 2021 17:12:55 +0000 (17:12 +0000)

committer Peter Waller <peter.waller@arm.com>

Tue, 7 Dec 2021 12:22:07 +0000 (12:22 +0000)
author Peter Waller <peter.waller@arm.com>
Mon, 6 Dec 2021 17:12:55 +0000 (17:12 +0000)
committer Peter Waller <peter.waller@arm.com>
Tue, 7 Dec 2021 12:22:07 +0000 (12:22 +0000)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

index 688bd6b..afafaa7 100644 (file)
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15985,7 +15985,9 @@ static SDValue performSTORECombine(SDNode *N,
    if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
        Value.getNode()->hasOneUse() && ST->isUnindexed() &&
        Subtarget->useSVEForFixedLengthVectors() &&
-      Value.getValueType().isFixedLengthVector())
+      Value.getValueType().isFixedLengthVector() &&
+      Value.getValueType().getFixedSizeInBits() >=
+          Subtarget->getMinSVEVectorSizeInBits())
      return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
                               ST->getMemoryVT(), ST->getMemOperand());
  
@@ -17346,7 +17348,8 @@ SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
    // they can be split down into something legal.
    if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
        N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
-      VT.isFixedLengthVector()) {
+      VT.isFixedLengthVector() &&
+      VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
      SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                       LN0->getChain(), LN0->getBasePtr(),
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll

index 3857b88..12a1a00 100644 (file)
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
@@ -25,24 +25,31 @@ target triple = "aarch64-unknown-linux-gnu"
  ;
  
  ; Don't use SVE for 64-bit vectors.
-define <2 x float> @fcvt_v2f16_v2f32(<2 x half> %op1) #0 {
+define void @fcvt_v2f16_v2f32(<2 x half>* %a, <2 x float>* %b) #0 {
  ; CHECK-LABEL: fcvt_v2f16_v2f32:
  ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0]
  ; CHECK-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    str d0, [x1]
  ; CHECK-NEXT:    ret
+  %op1 = load <2 x half>, <2 x half>* %a
    %res = fpext <2 x half> %op1 to <2 x float>
-  ret <2 x float> %res
+  store <2 x float> %res, <2 x float>* %b
+  ret void
  }
  
  ; Don't use SVE for 128-bit vectors.
-define <4 x float> @fcvt_v4f16_v4f32(<4 x half> %op1) #0 {
+define void @fcvt_v4f16_v4f32(<4 x half>* %a, <4 x float>* %b) #0 {
  ; CHECK-LABEL: fcvt_v4f16_v4f32:
  ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
  ; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    str q0, [x1]
  ; CHECK-NEXT:    ret
+  %op1 = load <4 x half>, <4 x half>* %a
    %res = fpext <4 x half> %op1 to <4 x float>
-  ret <4 x float> %res
+  store <4 x float> %res, <4 x float>* %b
+  ret void
  }
  
  define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {
@@ -120,28 +127,34 @@ define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 {
  ;
  
  ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fcvt_v1f16_v1f64(<1 x half> %op1) #0 {
+define void @fcvt_v1f16_v1f64(<1 x half>* %a, <1 x double>* %b) #0 {
  ; CHECK-LABEL: fcvt_v1f16_v1f64:
  ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
  ; CHECK-NEXT:    fcvt d0, h0
+; CHECK-NEXT:    str d0, [x1]
  ; CHECK-NEXT:    ret
+  %op1 = load <1 x half>, <1 x half>* %a
    %res = fpext <1 x half> %op1 to <1 x double>
-  ret <1 x double> %res
+  store <1 x double> %res, <1 x double>* %b
+  ret void
  }
  
  ; v2f16 is not legal for NEON, so use SVE
-define <2 x double> @fcvt_v2f16_v2f64(<2 x half> %op1) #0 {
+define void @fcvt_v2f16_v2f64(<2 x half>* %a, <2 x double>* %b) #0 {
  ; CHECK-LABEL: fcvt_v2f16_v2f64:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ldr s0, [x0]
  ; CHECK-NEXT:    ptrue p0.d, vl4
  ; CHECK-NEXT:    uunpklo z0.s, z0.h
  ; CHECK-NEXT:    uunpklo z0.d, z0.s
  ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    str q0, [x1]
  ; CHECK-NEXT:    ret
+  %op1 = load <2 x half>, <2 x half>* %a
    %res = fpext <2 x half> %op1 to <2 x double>
-  ret <2 x double> %res
+  store <2 x double> %res, <2 x double>* %b
+  ret void
  }
  
  define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {
@@ -218,24 +231,31 @@ define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 {
  ;
  
  ; Don't use SVE for 64-bit vectors.
-define <1 x double> @fcvt_v1f32_v1f64(<1 x float> %op1) #0 {
+define void @fcvt_v1f32_v1f64(<1 x float>* %a, <1 x double>* %b) #0 {
  ; CHECK-LABEL: fcvt_v1f32_v1f64:
  ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0]
  ; CHECK-NEXT:    fcvtl v0.2d, v0.2s
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    str d0, [x1]
  ; CHECK-NEXT:    ret
+  %op1 = load <1 x float>, <1 x float>* %a
    %res = fpext <1 x float> %op1 to <1 x double>
-  ret <1 x double> %res
+  store <1 x double> %res, <1 x double>* %b
+  ret void
  }
  
  ; Don't use SVE for 128-bit vectors.
-define <2 x double> @fcvt_v2f32_v2f64(<2 x float> %op1) #0 {
+define void @fcvt_v2f32_v2f64(<2 x float>* %a, <2 x double>* %b) #0 {
  ; CHECK-LABEL: fcvt_v2f32_v2f64:
  ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
  ; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    str q0, [x1]
  ; CHECK-NEXT:    ret
+  %op1 = load <2 x float>, <2 x float>* %a
    %res = fpext <2 x float> %op1 to <2 x double>
-  ret <2 x double> %res
+  store <2 x double> %res, <2 x double>* %b
+  ret void
  }
  
  define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {
@@ -273,7 +293,6 @@ define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
  ; VBITS_GE_512-NEXT:    fcvt z0.d, p0/m, z0.s
  ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
  ; VBITS_GE_512-NEXT:    ret
-
    %op1 = load <8 x float>, <8 x float>* %a
    %res = fpext <8 x float> %op1 to <8 x double>
    store <8 x double> %res, <8 x double>* %b
@@ -313,39 +332,45 @@ define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 {
  ;
  
  ; Don't use SVE for 64-bit vectors.
-define <2 x half> @fcvt_v2f32_v2f16(<2 x float> %op1) #0 {
+define void @fcvt_v2f32_v2f16(<2 x float>* %a, <2 x half>* %b) #0 {
  ; CHECK-LABEL: fcvt_v2f32_v2f16:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    ldr d0, [x0]
  ; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    str s0, [x1]
  ; CHECK-NEXT:    ret
+  %op1 = load <2 x float>, <2 x float>* %a
    %res = fptrunc <2 x float> %op1 to <2 x half>
-  ret <2 x half> %res
+  store <2 x half> %res, <2 x half>* %b
+  ret void
  }
  
  ; Don't use SVE for 128-bit vectors.
-define <4 x half> @fcvt_v4f32_v4f16(<4 x float> %op1) #0 {
+define void @fcvt_v4f32_v4f16(<4 x float>* %a, <4 x half>* %b) #0 {
  ; CHECK-LABEL: fcvt_v4f32_v4f16:
  ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
  ; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    str d0, [x1]
  ; CHECK-NEXT:    ret
+  %op1 = load <4 x float>, <4 x float>* %a
    %res = fptrunc <4 x float> %op1 to <4 x half>
-  ret <4 x half> %res
+  store <4 x half> %res, <4 x half>* %b
+  ret void
  }
  
-define <8 x half> @fcvt_v8f32_v8f16(<8 x float>* %a) #0 {
+define void @fcvt_v8f32_v8f16(<8 x float>* %a, <8 x half>* %b) #0 {
  ; CHECK-LABEL: fcvt_v8f32_v8f16:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ptrue p0.s, vl8
  ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ptrue p0.s
  ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
  ; CHECK-NEXT:    ret
    %op1 = load <8 x float>, <8 x float>* %a
    %res = fptrunc <8 x float> %op1 to <8 x half>
-  ret <8 x half> %res
+  store <8 x half> %res, <8 x half>* %b
+  ret void
  }
  
  define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 {
@@ -408,47 +433,51 @@ define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 {
  ;
  
  ; Don't use SVE for 64-bit vectors.
-define <1 x half> @fcvt_v1f64_v1f16(<1 x double> %op1) #0 {
+define void @fcvt_v1f64_v1f16(<1 x double>* %a, <1 x half>* %b) #0 {
  ; CHECK-LABEL: fcvt_v1f64_v1f16:
  ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
  ; CHECK-NEXT:    fcvt h0, d0
+; CHECK-NEXT:    str h0, [x1]
  ; CHECK-NEXT:    ret
+  %op1 = load <1 x double>, <1 x double>* %a
    %res = fptrunc <1 x double> %op1 to <1 x half>
-  ret <1 x half> %res
+  store <1 x half> %res, <1 x half>* %b
+  ret void
  }
  
  ; v2f16 is not legal for NEON, so use SVE
-define <2 x half> @fcvt_v2f64_v2f16(<2 x double> %op1) #0 {
+define void @fcvt_v2f64_v2f16(<2 x double>* %a, <2 x half>* %b) #0 {
  ; CHECK-LABEL: fcvt_v2f64_v2f16:
  ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    ldr q0, [x0]
  ; CHECK-NEXT:    ptrue p0.d
  ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
  ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
  ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    str s0, [x1]
  ; CHECK-NEXT:    ret
+  %op1 = load <2 x double>, <2 x double>* %a
    %res = fptrunc <2 x double> %op1 to <2 x half>
-  ret <2 x half> %res
+  store <2 x half> %res, <2 x half>* %b
+  ret void
  }
  
-define <4 x half> @fcvt_v4f64_v4f16(<4 x double>* %a) #0 {
+define void @fcvt_v4f64_v4f16(<4 x double>* %a, <4 x half>* %b) #0 {
  ; CHECK-LABEL: fcvt_v4f64_v4f16:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ptrue p0.d, vl4
  ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ptrue p0.d
  ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
  ; CHECK-NEXT:    ret
    %op1 = load <4 x double>, <4 x double>* %a
    %res = fptrunc <4 x double> %op1 to <4 x half>
-  ret <4 x half> %res
+  store <4 x half> %res, <4 x half>* %b
+  ret void
  }
  
-define <8 x half> @fcvt_v8f64_v8f16(<8 x double>* %a) #0 {
+define void @fcvt_v8f64_v8f16(<8 x double>* %a, <8 x half>* %b) #0 {
  ; Ensure sensible type legalisation
  ; VBITS_EQ_256-LABEL: fcvt_v8f64_v8f16:
  ; VBITS_EQ_256:       // %bb.0:
@@ -461,25 +490,23 @@ define <8 x half> @fcvt_v8f64_v8f16(<8 x double>* %a) #0 {
  ; VBITS_EQ_256-NEXT:    fcvt z1.h, p0/m, z1.d
  ; VBITS_EQ_256-NEXT:    uzp1 z0.s, z0.s, z0.s
  ; VBITS_EQ_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_EQ_256-NEXT:    uzp1 z2.h, z0.h, z0.h
-; VBITS_EQ_256-NEXT:    uzp1 z0.h, z1.h, z1.h
-; VBITS_EQ_256-NEXT:    mov v0.d[1], v2.d[0]
-; VBITS_EQ_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_EQ_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_EQ_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_EQ_256-NEXT:    mov v1.d[1], v0.d[0]
+; VBITS_EQ_256-NEXT:    str q1, [x1]
  ; VBITS_EQ_256-NEXT:    ret
  ;
  ; VBITS_GE_512-LABEL: fcvt_v8f64_v8f16:
  ; VBITS_GE_512:       // %bb.0:
  ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
  ; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ptrue p0.d
  ; VBITS_GE_512-NEXT:    fcvt z0.h, p0/m, z0.d
-; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_512-NEXT:    st1h { z0.d }, p0, [x1]
  ; VBITS_GE_512-NEXT:    ret
    %op1 = load <8 x double>, <8 x double>* %a
    %res = fptrunc <8 x double> %op1 to <8 x half>
-  ret <8 x half> %res
+  store <8 x half> %res, <8 x half>* %b
+  ret void
  }
  
  define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) #0 {
@@ -515,39 +542,42 @@ define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 {
  ;
  
  ; Don't use SVE for 64-bit vectors.
-define <1 x float> @fcvt_v1f64_v1f32(<1 x double> %op1) #0 {
+define void @fcvt_v1f64_v1f32(<1 x double> %op1, <1 x float>* %b) #0 {
  ; CHECK-LABEL: fcvt_v1f64_v1f32:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
  ; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    str s0, [x0]
  ; CHECK-NEXT:    ret
    %res = fptrunc <1 x double> %op1 to <1 x float>
-  ret <1 x float> %res
+  store <1 x float> %res, <1 x float>* %b
+  ret void
  }
  
  ; Don't use SVE for 128-bit vectors.
-define <2 x float> @fcvt_v2f64_v2f32(<2 x double> %op1) #0 {
+define void @fcvt_v2f64_v2f32(<2 x double> %op1, <2 x float>* %b) #0 {
  ; CHECK-LABEL: fcvt_v2f64_v2f32:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    str d0, [x0]
  ; CHECK-NEXT:    ret
    %res = fptrunc <2 x double> %op1 to <2 x float>
-  ret <2 x float> %res
+  store <2 x float> %res, <2 x float>* %b
+  ret void
  }
  
-define <4 x float> @fcvt_v4f64_v4f32(<4 x double>* %a) #0 {
+define void @fcvt_v4f64_v4f32(<4 x double>* %a, <4 x float>* %b) #0 {
  ; CHECK-LABEL: fcvt_v4f64_v4f32:
  ; CHECK:       // %bb.0:
  ; CHECK-NEXT:    ptrue p0.d, vl4
  ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ptrue p0.d
  ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    st1w { z0.d }, p0, [x1]
  ; CHECK-NEXT:    ret
    %op1 = load <4 x double>, <4 x double>* %a
    %res = fptrunc <4 x double> %op1 to <4 x float>
-  ret <4 x float> %res
+  store <4 x float> %res, <4 x float>* %b
+  ret void
  }
  
  define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 {
author	Peter Waller <peter.waller@arm.com>
	Mon, 6 Dec 2021 17:12:55 +0000 (17:12 +0000)
committer	Peter Waller <peter.waller@arm.com>
	Tue, 7 Dec 2021 12:22:07 +0000 (12:22 +0000)
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp		patch | blob | history
llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll		patch | blob | history