From: Fraser Cormack
Date: Wed, 6 Oct 2021 17:14:06 +0000 (+0100)
Subject: [SelectionDAG] Widen scalable-vector loads/stores via VP_LOAD/VP_STORE
X-Git-Tag: upstream/15.0.7~26196
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=332318ffb613907a268aa909552e3eb16bc1af6b;p=platform%2Fupstream%2Fllvm.git

[SelectionDAG] Widen scalable-vector loads/stores via VP_LOAD/VP_STORE

This patch fixes a compiler crash when widening scalable-vector loads
and stores which end up breaking down into element-wise operations,
something that is not possible for scalable vectors.

It does so by providing a way for targets with support for
vector-predicated loads and stores to use those instead. By widening
the operation but maintaining the original effective operation length
via the EVL, only the intended vector elements are loaded or stored.

In theory this method should also be possible, and even preferred, for
fixed-length vector types, but all fixed-length types can be broken
down into their elements, and regardless I have observed regressions in
the generated code when doing so. I believe this is simply due to
VP_LOAD/VP_STORE not being up to par with LOAD/STORE in terms of
optimization. It does improve performance on smaller self-contained
examples, however, so the potential is there.

While the only target that benefits from this is RISCV, the
legalization is generic and so was placed centrally.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D111248
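To illustrate the idea (only a sketch for exposition, not code produced
by this patch; the intrinsic manglings assume the typed-pointer form of
llvm.vp.load): a load of the illegal type <vscale x 3 x i8> is widened
to the legal type <vscale x 4 x i8>, and an EVL of 3 * vscale keeps the
access within the original element count. The widened load is then
morally equivalent to:

    declare i32 @llvm.vscale.i32()
    declare <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0nxv4i8(<vscale x 4 x i8>*, <vscale x 4 x i1>, i32)

    define <vscale x 4 x i8> @widened_load_sketch(<vscale x 4 x i8>* %ptr) {
      ; Build an all-ones mask: every lane below the EVL is active.
      %head = insertelement <vscale x 4 x i1> undef, i1 true, i32 0
      %mask = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
      ; EVL = 3 * vscale, the element count of the original <vscale x 3 x i8>.
      %vscale = call i32 @llvm.vscale.i32()
      %evl = mul i32 %vscale, 3
      %v = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0nxv4i8(<vscale x 4 x i8>* %ptr, <vscale x 4 x i1> %mask, i32 %evl)
      ret <vscale x 4 x i8> %v
    }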
---

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index f42f566..b5ff6b5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4178,23 +4178,48 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
   else
     Result = GenWidenVectorLoads(LdChain, LD);

-  if (!Result)
-    report_fatal_error("Unable to widen vector load");
+  if (Result) {
+    // If we generate a single load, we can use that for the chain. Otherwise,
+    // build a factor node to remember the multiple loads are independent and
+    // chain to that.
+    SDValue NewChain;
+    if (LdChain.size() == 1)
+      NewChain = LdChain[0];
+    else
+      NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, LdChain);

-  // If we generate a single load, we can use that for the chain. Otherwise,
-  // build a factor node to remember the multiple loads are independent and
-  // chain to that.
-  SDValue NewChain;
-  if (LdChain.size() == 1)
-    NewChain = LdChain[0];
-  else
-    NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, LdChain);
+    // Modified the chain - switch anything that used the old chain to use
+    // the new one.
+    ReplaceValueWith(SDValue(N, 1), NewChain);

-  // Modified the chain - switch anything that used the old chain to use
-  // the new one.
-  ReplaceValueWith(SDValue(N, 1), NewChain);
+    return Result;
+  }

-  return Result;
+  // Generate a vector-predicated load if it is custom/legal on the target. To
+  // avoid possible recursion, only do this if the widened mask type is legal.
+  // FIXME: Not all targets may support EVL in VP_LOAD. These will have been
+  // removed from the IR by the ExpandVectorPredication pass but we're
+  // reintroducing them here.
+  EVT LdVT = LD->getMemoryVT();
+  EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), LdVT);
+  EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                    WideVT.getVectorElementCount());
+  if (ExtType == ISD::NON_EXTLOAD && WideVT.isScalableVector() &&
+      TLI.isOperationLegalOrCustom(ISD::VP_LOAD, WideVT) &&
+      TLI.isTypeLegal(WideMaskVT)) {
+    SDLoc DL(N);
+    SDValue Mask = DAG.getAllOnesConstant(DL, WideMaskVT);
+    MVT EVLVT = TLI.getVPExplicitVectorLengthTy();
+    unsigned NumVTElts = LdVT.getVectorMinNumElements();
+    SDValue EVL =
+        DAG.getVScale(DL, EVLVT, APInt(EVLVT.getScalarSizeInBits(), NumVTElts));
+    const auto *MMO = LD->getMemOperand();
+    return DAG.getLoadVP(WideVT, DL, LD->getChain(), LD->getBasePtr(), Mask,
+                         EVL, MMO->getPointerInfo(), MMO->getAlign(),
+                         MMO->getFlags(), MMO->getAAInfo());
+  }
+
+  report_fatal_error("Unable to widen vector load");
 }

 SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
@@ -5035,13 +5060,42 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
     return TLI.scalarizeVectorStore(ST, DAG);

   SmallVector<SDValue, 16> StChain;
-  if (!GenWidenVectorStores(StChain, ST))
-    report_fatal_error("Unable to widen vector store");
+  if (GenWidenVectorStores(StChain, ST)) {
+    if (StChain.size() == 1)
+      return StChain[0];
+
+    return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
+  }

-  if (StChain.size() == 1)
-    return StChain[0];
+  // Generate a vector-predicated store if it is custom/legal on the target.
+  // To avoid possible recursion, only do this if the widened mask type is
+  // legal.
+  // FIXME: Not all targets may support EVL in VP_STORE. These will have been
+  // removed from the IR by the ExpandVectorPredication pass but we're
+  // reintroducing them here.
+  SDValue StVal = ST->getValue();
+  EVT StVT = StVal.getValueType();
+  EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StVT);
+  EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                    WideVT.getVectorElementCount());
+  if (WideVT.isScalableVector() &&
+      TLI.isOperationLegalOrCustom(ISD::VP_STORE, WideVT) &&
+      TLI.isTypeLegal(WideMaskVT)) {
+    // Widen the value.
+    SDLoc DL(N);
+    StVal = GetWidenedVector(StVal);
+    SDValue Mask = DAG.getAllOnesConstant(DL, WideMaskVT);
+    MVT EVLVT = TLI.getVPExplicitVectorLengthTy();
+    unsigned NumVTElts = StVT.getVectorMinNumElements();
+    SDValue EVL =
+        DAG.getVScale(DL, EVLVT, APInt(EVLVT.getScalarSizeInBits(), NumVTElts));
+    const auto *MMO = ST->getMemOperand();
+    return DAG.getStoreVP(ST->getChain(), DL, StVal, ST->getBasePtr(), Mask,
+                          EVL, MMO->getPointerInfo(), MMO->getAlign(),
+                          MMO->getFlags(), MMO->getAAInfo());
+  }

-  return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
+  report_fatal_error("Unable to widen vector store");
 }

 SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
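The store path is the mirror image, with one extra step: the value
operand is first widened via GetWidenedVector, and the EVL-limited
store then writes back only the original elements, so the padding lanes
introduced by widening never reach memory. Again only as a sketch with
assumed manglings, the widened store of a <vscale x 3 x i8> value is
morally equivalent to:

    declare i32 @llvm.vscale.i32()
    declare void @llvm.vp.store.nxv4i8.p0nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>*, <vscale x 4 x i1>, i32)

    define void @widened_store_sketch(<vscale x 4 x i8> %wideval, <vscale x 4 x i8>* %ptr) {
      ; All-ones mask, as in the load case.
      %head = insertelement <vscale x 4 x i1> undef, i1 true, i32 0
      %mask = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
      ; EVL = 3 * vscale: only the low 3 * vscale lanes of %wideval are
      ; written to memory.
      %vscale = call i32 @llvm.vscale.i32()
      %evl = mul i32 %vscale, 3
      call void @llvm.vp.store.nxv4i8.p0nxv4i8(<vscale x 4 x i8> %wideval, <vscale x 4 x i8>* %ptr, <vscale x 4 x i1> %mask, i32 %evl)
      ret void
    }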
diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
index 7676b29..aedf2b8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
@@ -1,16 +1,33 @@
-; RUN: not --crash llc -mtriple=riscv32 -mattr=+m,+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs < %s
-; RUN: not --crash llc -mtriple=riscv64 -mattr=+m,+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs < %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs < %s | FileCheck %s

 ; Check that we are able to legalize scalable-vector loads that require widening.

-; FIXME: LLVM can't yet widen scalable-vector loads.
-
 define <vscale x 3 x i8> @load_nxv3i8(<vscale x 3 x i8>* %ptr) {
+; CHECK-LABEL: load_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a1, a1, 3
+; CHECK-NEXT:    slli a2, a1, 1
+; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    ret
   %v = load <vscale x 3 x i8>, <vscale x 3 x i8>* %ptr
   ret <vscale x 3 x i8> %v
 }

 define <vscale x 5 x half> @load_nxv5f16(<vscale x 5 x half>* %ptr) {
+; CHECK-LABEL: load_nxv5f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a1, a1, 3
+; CHECK-NEXT:    slli a2, a1, 2
+; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    ret
   %v = load <vscale x 5 x half>, <vscale x 5 x half>* %ptr
   ret <vscale x 5 x half> %v
 }
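A note on the CHECK lines above: csrr a1, vlenb reads the vector
register length in bytes (VLEN/8), and on RISC-V vscale is defined as
VLEN/64, so the following srli recovers vscale itself. The shift/add
pair then forms the EVL as a multiple of vscale; for load_nxv3i8:

    vscale = vlenb >> 3                              (srli a1, a1, 3)
    evl    = (vscale << 1) + vscale = 3 * vscale     (slli + add)

which vsetvli installs as the active vector length before the vle8.v,
so only the original 3 * vscale elements are loaded.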
diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
index 2c52e1b..bd7993c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
@@ -1,16 +1,33 @@
-; RUN: not --crash llc -mtriple=riscv32 -mattr=+m,+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs < %s
-; RUN: not --crash llc -mtriple=riscv64 -mattr=+m,+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs < %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs < %s | FileCheck %s

 ; Check that we are able to legalize scalable-vector stores that require widening.

-; FIXME: LLVM can't yet widen scalable-vector stores.
-
 define void @store_nxv3i8(<vscale x 3 x i8> %val, <vscale x 3 x i8>* %ptr) {
+; CHECK-LABEL: store_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a1, a1, 3
+; CHECK-NEXT:    slli a2, a1, 1
+; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    ret
   store <vscale x 3 x i8> %val, <vscale x 3 x i8>* %ptr
   ret void
 }

 define void @store_nxv7f64(<vscale x 7 x double> %val, <vscale x 7 x double>* %ptr) {
+; CHECK-LABEL: store_nxv7f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a1, a1, 3
+; CHECK-NEXT:    slli a2, a1, 3
+; CHECK-NEXT:    sub a1, a2, a1
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vse64.v v8, (a0)
+; CHECK-NEXT:    ret
   store <vscale x 7 x double> %val, <vscale x 7 x double>* %ptr
   ret void
 }
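As a usage note (a hypothetical example; the file name and the exact
codegen are illustrative): any scalable vector type whose minimum
element count is not a power of two should exercise the new path, since
such types are legalized by widening. For instance, <vscale x 6 x i16>
should widen to <vscale x 8 x i16> with an EVL of 6 * vscale:

    ; widen-vp-example.ll (hypothetical)
    ; Run with: llc -mtriple=riscv64 -mattr=+m,+experimental-v -verify-machineinstrs < widen-vp-example.ll
    define <vscale x 6 x i16> @load_nxv6i16(<vscale x 6 x i16>* %ptr) {
      %v = load <vscale x 6 x i16>, <vscale x 6 x i16>* %ptr
      ret <vscale x 6 x i16> %v
    }

Before this patch, inputs like this crashed llc with "Unable to widen
vector load"; with it, they should compile to a single EVL-limited
vle16.v.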