From 0eeab8b2825ca9582b211fb5fbe782f702b30db7 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 27 Aug 2021 09:51:05 -0700
Subject: [PATCH] [RISCV] Add -riscv-v-fixed-length-vector-elen-max to limit
 the ELEN used for fixed length vectorization.

This adds an ELEN limit for fixed length vectors. Vectors whose
elements are larger than this limit will be scalarized. The limit also
disables some fractional LMULs. For example, if ELEN=32 then mf8
becomes illegal, i32/f32 vectors can't use any fractional LMULs,
i16/f16 can only use mf2, and i8 can use mf2 and mf4.

We may also need something for the scalable vectors, but that has
interactions with the intrinsics and we can't scalarize a scalable
vector.

Longer term this should come from one of the Zve* features.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp       |  17 +-
 llvm/lib/Target/RISCV/RISCVSubtarget.cpp          |  18 ++-
 llvm/lib/Target/RISCV/RISCVSubtarget.h            |   1 +
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h  |  12 ++
 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll | 184 ++++++++++++++++++++++
 5 files changed, 227 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0717432..790f795b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1204,8 +1204,10 @@ static bool useRVVForFixedLengthVectorVT(MVT VT,
 
   unsigned MinVLen = Subtarget.getMinRVVVectorSizeInBits();
 
+  MVT EltVT = VT.getVectorElementType();
+
   // Don't use RVV for vectors we cannot scalarize if required.
-  switch (VT.getVectorElementType().SimpleTy) {
+  switch (EltVT.SimpleTy) {
   // i1 is supported but has different rules.
   default:
     return false;
@@ -1234,6 +1236,10 @@ static bool useRVVForFixedLengthVectorVT(MVT VT,
     break;
   }
 
+  // Reject elements larger than ELEN.
+  if (EltVT.getSizeInBits() > Subtarget.getMaxELENForFixedLengthVectors())
+    return false;
+
   unsigned LMul = divideCeil(VT.getSizeInBits(), MinVLen);
   // Don't use RVV for types that don't fit.
   if (LMul > Subtarget.getMaxLMULForFixedLengthVectors())
@@ -1260,6 +1266,7 @@ static MVT getContainerForFixedLengthVector(const TargetLowering &TLI, MVT VT,
          "Expected legal fixed length vector!");
 
   unsigned MinVLen = Subtarget.getMinRVVVectorSizeInBits();
+  unsigned MaxELen = Subtarget.getMaxELENForFixedLengthVectors();
 
   MVT EltVT = VT.getVectorElementType();
   switch (EltVT.SimpleTy) {
@@ -1274,10 +1281,12 @@ static MVT getContainerForFixedLengthVector(const TargetLowering &TLI, MVT VT,
   case MVT::f32:
   case MVT::f64: {
     // We prefer to use LMUL=1 for VLEN sized types. Use fractional lmuls for
-    // narrower types, but we can't have a fractional LMUL with demoninator less
-    // than 64/SEW.
+    // narrower types. The smallest fractional LMUL we support is 8/ELEN. Within
+    // each fractional LMUL we support SEW between 8 and LMUL*ELEN.
     unsigned NumElts =
-        divideCeil(VT.getVectorNumElements(), MinVLen / RISCV::RVVBitsPerBlock);
+        (VT.getVectorNumElements() * RISCV::RVVBitsPerBlock) / MinVLen;
+    NumElts = std::max(NumElts, RISCV::RVVBitsPerBlock / MaxELen);
+    assert(isPowerOf2_32(NumElts) && "Expected power of 2 NumElts");
     return MVT::getScalableVectorVT(EltVT, NumElts);
   }
   }
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index b19fdcb..56437b7 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -45,6 +45,11 @@ static cl::opt<unsigned> RVVVectorLMULMax(
              "Fractional LMUL values are not supported."),
     cl::init(8), cl::Hidden);
 
+static cl::opt<unsigned> RVVVectorELENMax(
+    "riscv-v-fixed-length-vector-elen-max",
+    cl::desc("The maximum ELEN value to use for fixed length vectors."),
+    cl::init(64), cl::Hidden);
+
 void RISCVSubtarget::anchor() {}
 
 RISCVSubtarget &
@@ -142,7 +147,18 @@ unsigned RISCVSubtarget::getMaxLMULForFixedLengthVectors() const {
          "Tried to get maximum LMUL without V extension support!");
   assert(RVVVectorLMULMax <= 8 && isPowerOf2_32(RVVVectorLMULMax) &&
          "V extension requires a LMUL to be at most 8 and a power of 2!");
-  return PowerOf2Floor(std::max<unsigned>(RVVVectorLMULMax, 1));
+  return PowerOf2Floor(
+      std::max<unsigned>(std::min<unsigned>(RVVVectorLMULMax, 8), 1));
+}
+
+unsigned RISCVSubtarget::getMaxELENForFixedLengthVectors() const {
+  assert(hasStdExtV() &&
+         "Tried to get maximum ELEN without V extension support!");
+  assert(RVVVectorELENMax <= 64 && RVVVectorELENMax >= 8 &&
+         isPowerOf2_32(RVVVectorELENMax) &&
+         "V extension requires a ELEN to be a power of 2 between 8 and 64!");
+  return PowerOf2Floor(
+      std::max<unsigned>(std::min<unsigned>(RVVVectorELENMax, 64), 8));
 }
 
 bool RISCVSubtarget::useRVVForFixedLengthVectors() const {
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index ce36331..cf33ebf 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -158,6 +158,7 @@ public:
   unsigned getMaxRVVVectorSizeInBits() const;
   unsigned getMinRVVVectorSizeInBits() const;
   unsigned getMaxLMULForFixedLengthVectors() const;
+  unsigned getMaxELENForFixedLengthVectors() const;
   bool useRVVForFixedLengthVectors() const;
 };
 } // End llvm namespace
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 7be85cf..1c475bd 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -104,6 +104,12 @@ public:
     if (isa<FixedVectorType>(DataType) && ST->getMinRVVVectorSizeInBits() == 0)
       return false;
 
+    // Don't allow elements larger than the ELEN.
+    // FIXME: How to limit for scalable vectors?
+    if (isa<FixedVectorType>(DataType) &&
+        DataType->getScalarSizeInBits() > ST->getMaxELENForFixedLengthVectors())
+      return false;
+
     if (Alignment <
         DL.getTypeStoreSize(DataType->getScalarType()).getFixedSize())
       return false;
@@ -126,6 +132,12 @@ public:
     if (isa<FixedVectorType>(DataType) && ST->getMinRVVVectorSizeInBits() == 0)
       return false;
 
+    // Don't allow elements larger than the ELEN.
+    // FIXME: How to limit for scalable vectors?
+    if (isa<FixedVectorType>(DataType) &&
+        DataType->getScalarSizeInBits() > ST->getMaxELENForFixedLengthVectors())
+      return false;
+
     if (Alignment <
         DL.getTypeStoreSize(DataType->getScalarType()).getFixedSize())
       return false;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
new file mode 100644
index 0000000..8e78e89
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
@@ -0,0 +1,184 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-elen-max=32 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-elen-max=32 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+; Test that limiting ELEN scalarizes elements larger than that and disables
+; some fractional LMULs.
+
+; This should use LMUL=1.
+define void @add_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
+; CHECK-LABEL: add_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vadd.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i32>, <4 x i32>* %x
+  %b = load <4 x i32>, <4 x i32>* %y
+  %c = add <4 x i32> %a, %b
+  store <4 x i32> %c, <4 x i32>* %x
+  ret void
+}
+
+; i64 vectors should be scalarized
+define void @add_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
+; RV32-LABEL: add_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 8(a0)
+; RV32-NEXT:    lw a6, 12(a0)
+; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a7, 4(a0)
+; RV32-NEXT:    lw a3, 4(a1)
+; RV32-NEXT:    lw a5, 0(a1)
+; RV32-NEXT:    lw t0, 8(a1)
+; RV32-NEXT:    lw a1, 12(a1)
+; RV32-NEXT:    add a3, a7, a3
+; RV32-NEXT:    add a5, a4, a5
+; RV32-NEXT:    sltu a4, a5, a4
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    add a1, a6, a1
+; RV32-NEXT:    add a4, a2, t0
+; RV32-NEXT:    sltu a2, a4, a2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    sw a4, 8(a0)
+; RV32-NEXT:    sw a5, 0(a0)
+; RV32-NEXT:    sw a1, 12(a0)
+; RV32-NEXT:    sw a3, 4(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: add_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a2, 8(a0)
+; RV64-NEXT:    ld a3, 0(a0)
+; RV64-NEXT:    ld a4, 0(a1)
+; RV64-NEXT:    ld a1, 8(a1)
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    sd a1, 8(a0)
+; RV64-NEXT:    sd a3, 0(a0)
+; RV64-NEXT:    ret
+  %a = load <2 x i64>, <2 x i64>* %x
+  %b = load <2 x i64>, <2 x i64>* %y
+  %c = add <2 x i64> %a, %b
+  store <2 x i64> %c, <2 x i64>* %x
+  ret void
+}
+
+; This should use LMUL=1 because there are no fractional i32 LMULs with ELEN=32
+define void @add_v2i32(<2 x i32>* %x, <2 x i32>* %y) {
+; CHECK-LABEL: add_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vadd.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load <2 x i32>, <2 x i32>* %y
+  %c = add <2 x i32> %a, %b
+  store <2 x i32> %c, <2 x i32>* %x
+  ret void
+}
+
+; i64 vectors should be scalarized
+define void @add_v1i64(<1 x i64>* %x, <1 x i64>* %y) {
+; RV32-LABEL: add_v1i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 0(a0)
+; RV32-NEXT:    lw a3, 4(a0)
+; RV32-NEXT:    lw a4, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    sltu a2, a1, a2
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    sw a1, 0(a0)
+; RV32-NEXT:    sw a2, 4(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: add_v1i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a2, 0(a0)
+; RV64-NEXT:    ld a1, 0(a1)
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    sd a1, 0(a0)
+; RV64-NEXT:    ret
+  %a = load <1 x i64>, <1 x i64>* %x
+  %b = load <1 x i64>, <1 x i64>* %y
+  %c = add <1 x i64> %a, %b
+  store <1 x i64> %c, <1 x i64>* %x
+  ret void
+}
+
+; This should use LMUL=1.
+define void @fadd_v4f32(<4 x float>* %x, <4 x float>* %y) {
+; CHECK-LABEL: fadd_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vfadd.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x float>, <4 x float>* %x
+  %b = load <4 x float>, <4 x float>* %y
+  %c = fadd <4 x float> %a, %b
+  store <4 x float> %c, <4 x float>* %x
+  ret void
+}
+
+; double vectors should be scalarized
+define void @fadd_v2f64(<2 x double>* %x, <2 x double>* %y) {
+; CHECK-LABEL: fadd_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fld ft0, 8(a0)
+; CHECK-NEXT:    fld ft1, 0(a0)
+; CHECK-NEXT:    fld ft2, 0(a1)
+; CHECK-NEXT:    fld ft3, 8(a1)
+; CHECK-NEXT:    fadd.d ft1, ft1, ft2
+; CHECK-NEXT:    fadd.d ft0, ft0, ft3
+; CHECK-NEXT:    fsd ft0, 8(a0)
+; CHECK-NEXT:    fsd ft1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x double>, <2 x double>* %x
+  %b = load <2 x double>, <2 x double>* %y
+  %c = fadd <2 x double> %a, %b
+  store <2 x double> %c, <2 x double>* %x
+  ret void
+}
+
+; This should use LMUL=1 because there are no fractional float LMULs with ELEN=32
+define void @fadd_v2f32(<2 x float>* %x, <2 x float>* %y) {
+; CHECK-LABEL: fadd_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vfadd.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x float>, <2 x float>* %x
+  %b = load <2 x float>, <2 x float>* %y
+  %c = fadd <2 x float> %a, %b
+  store <2 x float> %c, <2 x float>* %x
+  ret void
+}
+
+; double vectors should be scalarized
+define void @fadd_v1f64(<1 x double>* %x, <1 x double>* %y) {
+; CHECK-LABEL: fadd_v1f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fld ft0, 0(a0)
+; CHECK-NEXT:    fld ft1, 0(a1)
+; CHECK-NEXT:    fadd.d ft0, ft0, ft1
+; CHECK-NEXT:    fsd ft0, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <1 x double>, <1 x double>* %x
+  %b = load <1 x double>, <1 x double>* %y
+  %c = fadd <1 x double> %a, %b
+  store <1 x double> %c, <1 x double>* %x
+  ret void
+}
-- 
2.7.4