}
}
+ auto *Load = cast<MemIntrinsicSDNode>(Op);
SDValue VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
+ SDValue Ptr = Op.getOperand(3);
+ SDValue Stride = Op.getOperand(4);
+ SDValue Result, Chain;
+
+ // TODO: We restrict this to unmasked loads currently in consideration of
+ // the complexity of handling all-false masks.
+ if (IsUnmasked && isNullConstant(Stride) &&
+ !Subtarget.hasOptimizedZeroStrideLoad()) {
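+ // The stride is zero and the target gains nothing from a zero-stride vlse:
+ // load the scalar element once and splat it across the vector instead.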
+ MVT ScalarVT = ContainerVT.getVectorElementType();
+ SDValue ScalarLoad =
+ DAG.getExtLoad(ISD::ZEXTLOAD, DL, XLenVT, Load->getChain(), Ptr,
+ ScalarVT, Load->getMemOperand());
+ Chain = ScalarLoad.getValue(1);
+ Result = lowerScalarSplat(SDValue(), ScalarLoad, VL, ContainerVT, DL, DAG,
+ Subtarget);
+ } else {
+ SDValue IntID = DAG.getTargetConstant(
+ IsUnmasked ? Intrinsic::riscv_vlse : Intrinsic::riscv_vlse_mask, DL,
+ XLenVT);
- SDValue IntID = DAG.getTargetConstant(
- IsUnmasked ? Intrinsic::riscv_vlse : Intrinsic::riscv_vlse_mask, DL,
- XLenVT);
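+ // Operand order for riscv_vlse(_mask): chain, intrinsic ID, passthru (or
+ // undef for the unmasked form), pointer, stride, [mask,] VL [, policy].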
+ SmallVector<SDValue, 8> Ops{Load->getChain(), IntID};
+ if (IsUnmasked)
+ Ops.push_back(DAG.getUNDEF(ContainerVT));
+ else
+ Ops.push_back(PassThru);
+ Ops.push_back(Ptr);
+ Ops.push_back(Stride);
+ if (!IsUnmasked)
+ Ops.push_back(Mask);
+ Ops.push_back(VL);
+ if (!IsUnmasked) {
+ SDValue Policy =
+ DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT);
+ Ops.push_back(Policy);
+ }
- auto *Load = cast<MemIntrinsicSDNode>(Op);
- SmallVector<SDValue, 8> Ops{Load->getChain(), IntID};
- if (IsUnmasked)
- Ops.push_back(DAG.getUNDEF(ContainerVT));
- else
- Ops.push_back(PassThru);
- Ops.push_back(Op.getOperand(3)); // Ptr
- Ops.push_back(Op.getOperand(4)); // Stride
- if (!IsUnmasked)
- Ops.push_back(Mask);
- Ops.push_back(VL);
- if (!IsUnmasked) {
- SDValue Policy = DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT);
- Ops.push_back(Policy);
+ SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
+ Result =
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
+ Load->getMemoryVT(), Load->getMemOperand());
+ Chain = Result.getValue(1);
}
-
- SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
- SDValue Result =
- DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
- Load->getMemoryVT(), Load->getMemOperand());
- SDValue Chain = Result.getValue(1);
if (VT.isFixedLengthVector())
Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
return DAG.getMergeValues({Result, Chain}, DL);
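
To see the two lowerings in isolation, the sketch below is a minimal standalone input one could feed to llc (riscv64, +v) with and without +no-optimized-zero-stride-load. It calls the riscv_masked_strided_load intrinsic that this code path lowers, but the type mangling in the declaration and the all-true mask shape are assumptions inferred from the operand handling above rather than copied from the test suite, so treat it as illustrative only, not as part of the patch.

; Illustrative sketch -- the intrinsic suffix below is an assumed mangling.
; Expected shape of the output: a zero-stride vlse8.v by default, and a scalar
; lbu feeding a splat once +no-optimized-zero-stride-load is enabled.
declare <4 x i8> @llvm.riscv.masked.strided.load.v4i8.p0i8.i64(<4 x i8>, i8*, i64, <4 x i1>)

define <4 x i8> @zero_stride_splat(i8* %p) {
  %v = call <4 x i8> @llvm.riscv.masked.strided.load.v4i8.p0i8.i64(
           <4 x i8> undef, i8* %p, i64 0,
           <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret <4 x i8> %v
}
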
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,V
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,ZVE32F
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+no-optimized-zero-stride-load -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+no-optimized-zero-stride-load -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED
%struct.foo = type { i32, i32, i32, i32 }
define void @gather_zero_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) {
;
-; CHECK-LABEL: gather_zero_stride:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li a2, 0
-; CHECK-NEXT: li a3, 32
-; CHECK-NEXT: li a4, 1024
-; CHECK-NEXT: .LBB3_1: # %vector.body
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-NEXT: vlse8.v v8, (a1), zero
-; CHECK-NEXT: add a5, a0, a2
-; CHECK-NEXT: vle8.v v9, (a5)
-; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vse8.v v8, (a5)
-; CHECK-NEXT: addi a2, a2, 32
-; CHECK-NEXT: addi a1, a1, 160
-; CHECK-NEXT: bne a2, a4, .LBB3_1
-; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT: ret
+; V-LABEL: gather_zero_stride:
+; V: # %bb.0: # %entry
+; V-NEXT: li a2, 0
+; V-NEXT: li a3, 32
+; V-NEXT: li a4, 1024
+; V-NEXT: .LBB3_1: # %vector.body
+; V-NEXT: # =>This Inner Loop Header: Depth=1
+; V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; V-NEXT: vlse8.v v8, (a1), zero
+; V-NEXT: add a5, a0, a2
+; V-NEXT: vle8.v v9, (a5)
+; V-NEXT: vadd.vv v8, v9, v8
+; V-NEXT: vse8.v v8, (a5)
+; V-NEXT: addi a2, a2, 32
+; V-NEXT: addi a1, a1, 160
+; V-NEXT: bne a2, a4, .LBB3_1
+; V-NEXT: # %bb.2: # %for.cond.cleanup
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: gather_zero_stride:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: li a2, 0
+; ZVE32F-NEXT: li a3, 32
+; ZVE32F-NEXT: li a4, 1024
+; ZVE32F-NEXT: .LBB3_1: # %vector.body
+; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
+; ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; ZVE32F-NEXT: vlse8.v v8, (a1), zero
+; ZVE32F-NEXT: add a5, a0, a2
+; ZVE32F-NEXT: vle8.v v9, (a5)
+; ZVE32F-NEXT: vadd.vv v8, v9, v8
+; ZVE32F-NEXT: vse8.v v8, (a5)
+; ZVE32F-NEXT: addi a2, a2, 32
+; ZVE32F-NEXT: addi a1, a1, 160
+; ZVE32F-NEXT: bne a2, a4, .LBB3_1
+; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup
+; ZVE32F-NEXT: ret
+;
+; NOT-OPTIMIZED-LABEL: gather_zero_stride:
+; NOT-OPTIMIZED: # %bb.0: # %entry
+; NOT-OPTIMIZED-NEXT: li a2, 0
+; NOT-OPTIMIZED-NEXT: li a3, 32
+; NOT-OPTIMIZED-NEXT: li a4, 1024
+; NOT-OPTIMIZED-NEXT: .LBB3_1: # %vector.body
+; NOT-OPTIMIZED-NEXT: # =>This Inner Loop Header: Depth=1
+; NOT-OPTIMIZED-NEXT: lbu a5, 0(a1)
+; NOT-OPTIMIZED-NEXT: add a6, a0, a2
+; NOT-OPTIMIZED-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; NOT-OPTIMIZED-NEXT: vle8.v v8, (a6)
+; NOT-OPTIMIZED-NEXT: vadd.vx v8, v8, a5
+; NOT-OPTIMIZED-NEXT: vse8.v v8, (a6)
+; NOT-OPTIMIZED-NEXT: addi a2, a2, 32
+; NOT-OPTIMIZED-NEXT: addi a1, a1, 160
+; NOT-OPTIMIZED-NEXT: bne a2, a4, .LBB3_1
+; NOT-OPTIMIZED-NEXT: # %bb.2: # %for.cond.cleanup
+; NOT-OPTIMIZED-NEXT: ret
entry:
br label %vector.body