This adds a +unaligned-vector-mem subtarget feature, which allows us to model and thus test transforms that are legal only when vector loads with less-than-element alignment are supported. This was originally part of D126085, but was split out as we didn't have a good example of such a transform. As can be seen in the test diffs, the recently added concat_vector(loads) -> strided_load transform (from D147713) now benefits from the unaligned support.
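As a concrete illustration (excerpted from the strided_unaligned test updated below), two unaligned <4 x i16> loads separated by a runtime stride previously had to stay as two e8 loads plus a slideup; with +unaligned-vector-mem they can be combined into a single strided load:

  %a = load <4 x i16>, ptr %x, align 1
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 1
  ...

  ; with +unaligned-vector-mem
  vsetivli zero, 2, e64, m1, ta, ma
  vlse64.v v8, (a0), a2
  vse64.v  v8, (a1)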
While making this change, I realized that we can, in fact, support unaligned vector loads and stores of all element types via conversion to an i8 element type. For contiguous loads and stores without masking, we already implement this in the backend - though we don't tell the optimizer that. For indexed accesses, lowering to i8 requires complicated addressing, and for segmented accesses we'd have to fall back to indexed ones. All around, it doesn't seem worth pursuing, but it makes for an interesting observation.
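For example, the unmasked scalable load in the nxv1i32 test below is already handled without the feature by reinterpreting the access with an e8 element type over the same byte span; with +unaligned-vector-mem it simply keeps its natural element width:

  %v = load <vscale x 1 x i32>, <vscale x 1 x i32>* %ptr, align 1

  ; default lowering
  vsetvli a1, zero, e8, mf2, ta, ma
  vle8.v  v8, (a0)

  ; with +unaligned-vector-mem
  vsetvli a1, zero, e32, mf2, ta, ma
  vle32.v v8, (a0)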
Differential Revision: https://reviews.llvm.org/D149375
"true", "Has reasonably performant unaligned scalar "
"loads and stores">;
+def FeatureUnalignedVectorMem
+ : SubtargetFeature<"unaligned-vector-mem", "EnableUnalignedVectorMem",
+ "true", "Has reasonably performant unaligned vector "
+ "loads and stores">;
+
def TuneNoOptimizedZeroStrideLoad
: SubtargetFeature<"no-optimized-zero-stride-load", "HasOptimizedZeroStrideLoad",
"false", "Hasn't optimized (perform fewer memory operations)"
return true;
}
- return false;
+ // Note: We lower an unmasked unaligned vector access to an equally sized
+ // e8 element type access. Given this, we effectively support all unmasked
+ // misaligned accesses. TODO: Work through the codegen implications of
+ // allowing such accesses to be formed, and considered fast.
+ if (Fast)
+ *Fast = 0;
+ return Subtarget.enableUnalignedVectorMem();
}
bool RISCVTargetLowering::splitValueIntoRegisterParts(
if (!isLegalElementTypeForRVV(ScalarType))
return false;
- if (Alignment < DL.getTypeStoreSize(ScalarType).getFixedValue())
+ if (!Subtarget.enableUnalignedVectorMem() &&
+ Alignment < DL.getTypeStoreSize(ScalarType).getFixedValue())
return false;
return true;
if (isa<FixedVectorType>(DataType) && !ST->useRVVForFixedLengthVectors())
return false;
- if (Alignment <
- DL.getTypeStoreSize(DataType->getScalarType()).getFixedValue())
+ auto *ElemType = DataType->getScalarType();
+ if (!ST->enableUnalignedVectorMem() &&
+ Alignment < DL.getTypeStoreSize(ElemType).getFixedValue())
return false;
- return TLI->isLegalElementTypeForRVV(DataType->getScalarType());
+ return TLI->isLegalElementTypeForRVV(ElemType);
}
bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
if (isa<FixedVectorType>(DataType) && !ST->useRVVForFixedLengthVectors())
return false;
- if (Alignment <
- DL.getTypeStoreSize(DataType->getScalarType()).getFixedValue())
+ auto *ElemType = DataType->getScalarType();
+ if (!ST->enableUnalignedVectorMem() &&
+ Alignment < DL.getTypeStoreSize(ElemType).getFixedValue())
return false;
- return TLI->isLegalElementTypeForRVV(DataType->getScalarType());
+ return TLI->isLegalElementTypeForRVV(ElemType);
}
bool isLegalMaskedGather(Type *DataType, Align Alignment) {
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64
-; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,ZVE64F
+; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV64
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh,+unaligned-vector-mem -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN
+
+; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,ZVE64F
; The two loads are contiguous and should be folded into one
define void @widen_2xv4i16(ptr %x, ptr %z) {
ret void
}
+define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
+; CHECK-NO-MISALIGN-LABEL: widen_4xv4i16_unaligned:
+; CHECK-NO-MISALIGN: # %bb.0:
+; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NO-MISALIGN-NEXT: vle8.v v8, (a0)
+; CHECK-NO-MISALIGN-NEXT: addi a2, a0, 8
+; CHECK-NO-MISALIGN-NEXT: vle8.v v10, (a2)
+; CHECK-NO-MISALIGN-NEXT: addi a2, a0, 16
+; CHECK-NO-MISALIGN-NEXT: vle8.v v12, (a2)
+; CHECK-NO-MISALIGN-NEXT: addi a0, a0, 24
+; CHECK-NO-MISALIGN-NEXT: vle8.v v14, (a0)
+; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 12, e16, m2, tu, ma
+; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v12, 8
+; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v14, 12
+; CHECK-NO-MISALIGN-NEXT: vse16.v v8, (a1)
+; CHECK-NO-MISALIGN-NEXT: ret
+;
+; RV64-MISALIGN-LABEL: widen_4xv4i16_unaligned:
+; RV64-MISALIGN: # %bb.0:
+; RV64-MISALIGN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; RV64-MISALIGN-NEXT: vle16.v v8, (a0)
+; RV64-MISALIGN-NEXT: vse16.v v8, (a1)
+; RV64-MISALIGN-NEXT: ret
+ %a = load <4 x i16>, ptr %x, align 1
+ %b.gep = getelementptr i8, ptr %x, i64 8
+ %b = load <4 x i16>, ptr %b.gep, align 1
+ %c.gep = getelementptr i8, ptr %b.gep, i64 8
+ %c = load <4 x i16>, ptr %c.gep, align 1
+ %d.gep = getelementptr i8, ptr %c.gep, i64 8
+ %d = load <4 x i16>, ptr %d.gep, align 1
+ %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ store <16 x i16> %e.2, ptr %z
+ ret void
+}
+
; Should be a strided load - with type coercion to i64
define void @strided_constant(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant:
ret void
}
-; Shouldn't be combined because the resulting load would not be aligned
define void @strided_unaligned(ptr %x, ptr %z, i64 %s) {
-; CHECK-LABEL: strided_unaligned:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: vle8.v v9, (a0)
-; CHECK-NEXT: vslideup.vi v8, v9, 4
-; CHECK-NEXT: vse16.v v8, (a1)
-; CHECK-NEXT: ret
+; CHECK-NO-MISALIGN-LABEL: strided_unaligned:
+; CHECK-NO-MISALIGN: # %bb.0:
+; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NO-MISALIGN-NEXT: vle8.v v8, (a0)
+; CHECK-NO-MISALIGN-NEXT: add a0, a0, a2
+; CHECK-NO-MISALIGN-NEXT: vle8.v v9, (a0)
+; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v9, 4
+; CHECK-NO-MISALIGN-NEXT: vse16.v v8, (a1)
+; CHECK-NO-MISALIGN-NEXT: ret
+;
+; RV64-MISALIGN-LABEL: strided_unaligned:
+; RV64-MISALIGN: # %bb.0:
+; RV64-MISALIGN-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-MISALIGN-NEXT: vlse64.v v8, (a0), a2
+; RV64-MISALIGN-NEXT: vse64.v v8, (a1)
+; RV64-MISALIGN-NEXT: ret
%a = load <4 x i16>, ptr %x, align 1
%b.gep = getelementptr i8, ptr %x, i64 %s
%b = load <4 x i16>, ptr %b.gep, align 1
; RUN: -verify-machineinstrs | FileCheck %s
; RUN: llc -mtriple riscv64 -mattr=+d,+zfh,+experimental-zvfh,+v < %s \
; RUN: -verify-machineinstrs | FileCheck %s
+; RUN: llc -mtriple riscv32 -mattr=+d,+zfh,+experimental-zvfh,+v,+unaligned-vector-mem < %s \
+; RUN: -verify-machineinstrs | FileCheck --check-prefix=UNALIGNED %s
+; RUN: llc -mtriple riscv64 -mattr=+d,+zfh,+experimental-zvfh,+v,+unaligned-vector-mem < %s \
+; RUN: -verify-machineinstrs | FileCheck --check-prefix=UNALIGNED %s
+
define <vscale x 1 x i32> @unaligned_load_nxv1i32_a1(<vscale x 1 x i32>* %ptr) {
; CHECK-LABEL: unaligned_load_nxv1i32_a1:
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv1i32_a1:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; UNALIGNED-NEXT: vle32.v v8, (a0)
+; UNALIGNED-NEXT: ret
%v = load <vscale x 1 x i32>, <vscale x 1 x i32>* %ptr, align 1
ret <vscale x 1 x i32> %v
}
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv1i32_a2:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; UNALIGNED-NEXT: vle32.v v8, (a0)
+; UNALIGNED-NEXT: ret
%v = load <vscale x 1 x i32>, <vscale x 1 x i32>* %ptr, align 2
ret <vscale x 1 x i32> %v
}
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: aligned_load_nxv1i32_a4:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; UNALIGNED-NEXT: vle32.v v8, (a0)
+; UNALIGNED-NEXT: ret
%v = load <vscale x 1 x i32>, <vscale x 1 x i32>* %ptr, align 4
ret <vscale x 1 x i32> %v
}
; CHECK: # %bb.0:
; CHECK-NEXT: vl1r.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv1i64_a1:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vl1re64.v v8, (a0)
+; UNALIGNED-NEXT: ret
%v = load <vscale x 1 x i64>, <vscale x 1 x i64>* %ptr, align 1
ret <vscale x 1 x i64> %v
}
; CHECK: # %bb.0:
; CHECK-NEXT: vl1r.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv1i64_a4:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vl1re64.v v8, (a0)
+; UNALIGNED-NEXT: ret
%v = load <vscale x 1 x i64>, <vscale x 1 x i64>* %ptr, align 4
ret <vscale x 1 x i64> %v
}
; CHECK: # %bb.0:
; CHECK-NEXT: vl1re64.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: aligned_load_nxv1i64_a8:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vl1re64.v v8, (a0)
+; UNALIGNED-NEXT: ret
%v = load <vscale x 1 x i64>, <vscale x 1 x i64>* %ptr, align 8
ret <vscale x 1 x i64> %v
}
; CHECK: # %bb.0:
; CHECK-NEXT: vl2r.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv2i64_a1:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vl2re64.v v8, (a0)
+; UNALIGNED-NEXT: ret
%v = load <vscale x 2 x i64>, <vscale x 2 x i64>* %ptr, align 1
ret <vscale x 2 x i64> %v
}
; CHECK: # %bb.0:
; CHECK-NEXT: vl2r.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv2i64_a4:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vl2re64.v v8, (a0)
+; UNALIGNED-NEXT: ret
%v = load <vscale x 2 x i64>, <vscale x 2 x i64>* %ptr, align 4
ret <vscale x 2 x i64> %v
}
; CHECK: # %bb.0:
; CHECK-NEXT: vl2re64.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: aligned_load_nxv2i64_a8:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vl2re64.v v8, (a0)
+; UNALIGNED-NEXT: ret
%v = load <vscale x 2 x i64>, <vscale x 2 x i64>* %ptr, align 8
ret <vscale x 2 x i64> %v
}
; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv1i1_a1:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; UNALIGNED-NEXT: vlm.v v0, (a0)
+; UNALIGNED-NEXT: ret
%v = load <vscale x 1 x i1>, <vscale x 1 x i1>* %ptr, align 1
ret <vscale x 1 x i1> %v
}
; CHECK: # %bb.0:
; CHECK-NEXT: vl2r.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv4f32_a1:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vl2re32.v v8, (a0)
+; UNALIGNED-NEXT: ret
%v = load <vscale x 4 x float>, <vscale x 4 x float>* %ptr, align 1
ret <vscale x 4 x float> %v
}
; CHECK: # %bb.0:
; CHECK-NEXT: vl2r.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv4f32_a2:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vl2re32.v v8, (a0)
+; UNALIGNED-NEXT: ret
%v = load <vscale x 4 x float>, <vscale x 4 x float>* %ptr, align 2
ret <vscale x 4 x float> %v
}
; CHECK: # %bb.0:
; CHECK-NEXT: vl2re32.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: aligned_load_nxv4f32_a4:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vl2re32.v v8, (a0)
+; UNALIGNED-NEXT: ret
%v = load <vscale x 4 x float>, <vscale x 4 x float>* %ptr, align 4
ret <vscale x 4 x float> %v
}
; CHECK: # %bb.0:
; CHECK-NEXT: vl2r.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: unaligned_load_nxv8f16_a1:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vl2re16.v v8, (a0)
+; UNALIGNED-NEXT: ret
%v = load <vscale x 8 x half>, <vscale x 8 x half>* %ptr, align 1
ret <vscale x 8 x half> %v
}
; CHECK: # %bb.0:
; CHECK-NEXT: vl2re16.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: aligned_load_nxv8f16_a2:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vl2re16.v v8, (a0)
+; UNALIGNED-NEXT: ret
%v = load <vscale x 8 x half>, <vscale x 8 x half>* %ptr, align 2
ret <vscale x 8 x half> %v
}
; CHECK: # %bb.0:
; CHECK-NEXT: vs2r.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: unaligned_store_nxv4i32_a1:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vs2r.v v8, (a0)
+; UNALIGNED-NEXT: ret
store <vscale x 4 x i32> %x, <vscale x 4 x i32>* %ptr, align 1
ret void
}
; CHECK: # %bb.0:
; CHECK-NEXT: vs2r.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: unaligned_store_nxv4i32_a2:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vs2r.v v8, (a0)
+; UNALIGNED-NEXT: ret
store <vscale x 4 x i32> %x, <vscale x 4 x i32>* %ptr, align 2
ret void
}
; CHECK: # %bb.0:
; CHECK-NEXT: vs2r.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: aligned_store_nxv4i32_a4:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vs2r.v v8, (a0)
+; UNALIGNED-NEXT: ret
store <vscale x 4 x i32> %x, <vscale x 4 x i32>* %ptr, align 4
ret void
}
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: unaligned_store_nxv1i16_a1:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; UNALIGNED-NEXT: vse16.v v8, (a0)
+; UNALIGNED-NEXT: ret
store <vscale x 1 x i16> %x, <vscale x 1 x i16>* %ptr, align 1
ret void
}
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
+;
+; UNALIGNED-LABEL: aligned_store_nxv1i16_a2:
+; UNALIGNED: # %bb.0:
+; UNALIGNED-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; UNALIGNED-NEXT: vse16.v v8, (a0)
+; UNALIGNED-NEXT: ret
store <vscale x 1 x i16> %x, <vscale x 1 x i16>* %ptr, align 2
ret void
}