// extract_vector_elt of out-of-bounds element -> UNDEF
auto *IndexC = dyn_cast<ConstantSDNode>(Index);
- unsigned NumElts = VecVT.getVectorNumElements();
- unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
- if (IndexC && IndexC->getAPIntValue().uge(NumElts))
+ if (IndexC && VecVT.isFixedLengthVector() &&
+ IndexC->getAPIntValue().uge(VecVT.getVectorNumElements()))
return DAG.getUNDEF(ScalarVT);
// extract_vector_elt (build_vector x, y), 1 -> y
- if (IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR &&
+ if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) ||
+ VecOp.getOpcode() == ISD::SPLAT_VECTOR) &&
TLI.isTypeLegal(VecVT) &&
(VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT))) {
- SDValue Elt = VecOp.getOperand(IndexC->getZExtValue());
+ assert((VecOp.getOpcode() != ISD::BUILD_VECTOR ||
+ VecVT.isFixedLengthVector()) &&
+ "BUILD_VECTOR used for scalable vectors");
+ unsigned IndexVal =
+ VecOp.getOpcode() == ISD::BUILD_VECTOR ? IndexC->getZExtValue() : 0;
+ SDValue Elt = VecOp.getOperand(IndexVal);
EVT InEltVT = Elt.getValueType();
// Sometimes build_vector's scalar input types do not match result type.
// converts.
}
+ if (VecVT.isScalableVector())
+ return SDValue();
+
+ // All the code from this point onwards assumes fixed width vectors, but it's
+ // possible that some of the combinations could be made to work for scalable
+ // vectors too.
+ unsigned NumElts = VecVT.getVectorNumElements();
+ unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
+
// TODO: These transforms should not require the 'hasOneUse' restriction, but
// there are regressions on multiple targets without it. We can end up with a
// mess of scalar and vector code if we reduce only part of the DAG to scalar.
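
As an aside (not part of the patch), here is a minimal standalone sketch of why the out-of-bounds fold above has to be limited to fixed-length vectors. It is plain C++, not LLVM API, and all names are illustrative: a constant index can only be proven out of range when the element count is itself a compile-time constant.

```cpp
#include <cstdio>

// Fixed-length vector: the element count is a compile-time constant, so any
// index >= NumElts is provably out of bounds and the extract folds to undef.
bool provablyOutOfBoundsFixed(unsigned Index, unsigned NumElts) {
  return Index >= NumElts;
}

// Scalable vector: the real element count is MinNumElts * vscale, and vscale
// is only known at run time, so no constant index can be proven out of bounds
// at the target-independent level.
bool provablyOutOfBoundsScalable(unsigned Index, unsigned MinNumElts) {
  (void)Index;
  (void)MinNumElts;
  return false;
}

int main() {
  printf("%d\n", provablyOutOfBoundsFixed(7, 4));    // 1: <4 x i32>, lane 7
  printf("%d\n", provablyOutOfBoundsScalable(7, 4)); // 0: <vscale x 4 x i32>
  return 0;
}
```
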
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
if (N1.isUndef() || N2.isUndef())
return getUNDEF(VT);
- // EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF
- if (N2C && N2C->getAPIntValue().uge(N1.getValueType().getVectorNumElements()))
+  // EXTRACT_VECTOR_ELT of an out-of-bounds element is an UNDEF for fixed
+  // length vectors. For scalable vectors we cannot prove a constant index is
+  // out of bounds at compile time, so arbitrary indices are left for the
+  // target to handle.
+ if (N2C && N1.getValueType().isFixedLengthVector() &&
+ N2C->getAPIntValue().uge(N1.getValueType().getVectorNumElements()))
return getUNDEF(VT);
// EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is
- // expanding copies of large vectors from registers.
- if (N2C &&
- N1.getOpcode() == ISD::CONCAT_VECTORS &&
- N1.getNumOperands() > 0) {
+ // expanding copies of large vectors from registers. This only works for
+ // fixed length vectors, since we need to know the exact number of
+ // elements.
+  if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS &&
+      N1.getNumOperands() > 0 &&
+      N1.getOperand(0).getValueType().isFixedLengthVector()) {
unsigned Factor =
N1.getOperand(0).getValueType().getVectorNumElements();
      return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                     N1.getOperand(N2C->getZExtValue() / Factor),
                     getVectorIdxConstant(N2C->getZExtValue() % Factor, DL));
}
- // EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is
- // expanding large vector constants.
- if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) {
- SDValue Elt = N1.getOperand(N2C->getZExtValue());
+ // EXTRACT_VECTOR_ELT of BUILD_VECTOR or SPLAT_VECTOR is often formed while
+ // lowering is expanding large vector constants.
+ if (N2C && (N1.getOpcode() == ISD::BUILD_VECTOR ||
+ N1.getOpcode() == ISD::SPLAT_VECTOR)) {
+ assert((N1.getOpcode() != ISD::BUILD_VECTOR ||
+ N1.getValueType().isFixedLengthVector()) &&
+ "BUILD_VECTOR used for scalable vectors");
+ unsigned Index =
+ N1.getOpcode() == ISD::BUILD_VECTOR ? N2C->getZExtValue() : 0;
+ SDValue Elt = N1.getOperand(Index);
if (VT != Elt.getValueType())
// If the vector element type is not legal, the BUILD_VECTOR operands
// EXTRACT_VECTOR_ELT of v1iX EXTRACT_SUBVECTOR could be formed
// when vector types are scalarized and v1iX is legal.
- // vextract (v1iX extract_subvector(vNiX, Idx)) -> vextract(vNiX,Idx)
+ // vextract (v1iX extract_subvector(vNiX, Idx)) -> vextract(vNiX,Idx).
+  // Here we are completely ignoring the extract element index (N2), which is
+  // fine for fixed width vectors, since any index other than 0 yields undef
+  // anyway. For scalable vectors the index cannot be ignored, because lanes
+  // beyond the first may be in bounds; in theory we could support that, but
+  // not without a profitability check.
if (N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N1.getValueType().isFixedLengthVector() &&
N1.getValueType().getVectorNumElements() == 1) {
return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0),
N1.getOperand(1));
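
For readers less familiar with the CONCAT_VECTORS fold earlier in this hunk, a small standalone sketch of the index arithmetic it relies on; this is plain C++ with a made-up data layout, not LLVM API. Lane Idx of a concatenation whose parts each hold Factor lanes lives in part Idx / Factor at lane Idx % Factor, which is only meaningful when Factor is a compile-time constant, i.e. when the parts are fixed-length vectors.

```cpp
#include <array>
#include <cassert>
#include <cstdio>
#include <vector>

// Each std::array stands in for one fixed-length operand of a CONCAT_VECTORS.
int extractFromConcat(const std::vector<std::array<int, 4>> &Parts,
                      unsigned Idx) {
  const unsigned Factor = 4; // lanes per concatenated part
  assert(Idx / Factor < Parts.size() && "index past the end of the concat");
  return Parts[Idx / Factor][Idx % Factor];
}

int main() {
  std::vector<std::array<int, 4>> Parts = {{{0, 1, 2, 3}}, {{4, 5, 6, 7}}};
  printf("%d\n", extractFromConcat(Parts, 5)); // 5: part 1, lane 1
  return 0;
}
```
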
(INDEX_II_D 0, 1),
(DUP_ZR_D $index)),
$src)>;
+
+ // Extract element from vector with immediate index
+ def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>;
+ def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), ssub)>;
+ def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
+ def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
+ def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
+ def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
+ def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
+
+ // Extract element from vector with scalar index
+ def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index),
+ ZPR:$vec)>;
+ def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
+ ZPR:$vec)>;
+ def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
+ ZPR:$vec)>;
+ def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
+ ZPR:$vec)>;
+
+ def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
+ ZPR:$vec)>;
+ def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
+ ZPR:$vec)>;
+ def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
+ ZPR:$vec)>;
}
let Predicates = [HasSVE, HasMatMulInt8] in {
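
The scalar-index patterns above lean on the WHILELS + LASTB idiom (the immediate-index patterns instead duplicate the requested lane with DUP_ZZI and read the low subregister). As a rough illustration only, in plain C++ mimicking the instruction behaviour rather than a faithful SVE model: WHILELS with a base of zero activates lanes 0 through the requested index, and LASTB reads the last active lane, so an in-range index selects exactly that lane while an index past the end clamps to the final lane, where the result is undefined anyway.

```cpp
#include <cstdio>
#include <vector>

// Sketch of extractelement with a runtime index via WHILELS + LASTB.
int extractViaWhilelsLastb(const std::vector<int> &Vec, unsigned Idx) {
  unsigned LastActive = 0;
  for (unsigned Lane = 0; Lane < Vec.size(); ++Lane)
    if (Lane <= Idx)     // WHILELS xzr, Idx: a lane is active while Lane <= Idx
      LastActive = Lane; // LASTB: remember the last active lane
  return Vec[LastActive];
}

int main() {
  std::vector<int> Vec = {10, 11, 12, 13};
  printf("%d\n", extractViaWhilelsLastb(Vec, 2));  // 12: in-range index
  printf("%d\n", extractViaWhilelsLastb(Vec, 64)); // 13: clamps to last lane
  return 0;
}
```
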
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define i8 @test_lane0_16xi8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_lane0_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.b, b0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 0
+ ret i8 %b
+}
+
+define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: test_lane0_8xi16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 0
+ ret i16 %b
+}
+
+define i32 @test_lane0_4xi32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: test_lane0_4xi32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.s, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 4 x i32> %a, i32 0
+ ret i32 %b
+}
+
+define i64 @test_lane0_2xi64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: test_lane0_2xi64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.d, d0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 2 x i64> %a, i32 0
+ ret i64 %b
+}
+
+define double @test_lane0_2xf64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: test_lane0_2xf64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 2 x double> %a, i32 0
+ ret double %b
+}
+
+define float @test_lane0_4xf32(<vscale x 4 x float> %a) {
+; CHECK-LABEL: test_lane0_4xf32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 4 x float> %a, i32 0
+ ret float %b
+}
+
+define half @test_lane0_8xf16(<vscale x 8 x half> %a) {
+; CHECK-LABEL: test_lane0_8xf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x half> %a, i32 0
+ ret half %b
+}
+
+define i8 @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: whilels p0.b, xzr, x8
+; CHECK-NEXT: lastb w0, p0, z0.b
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 %x
+ ret i8 %b
+}
+
+define i16 @test_lanex_8xi16(<vscale x 8 x i16> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_8xi16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: whilels p0.h, xzr, x8
+; CHECK-NEXT: lastb w0, p0, z0.h
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 %x
+ ret i16 %b
+}
+
+define i32 @test_lanex_4xi32(<vscale x 4 x i32> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_4xi32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: whilels p0.s, xzr, x8
+; CHECK-NEXT: lastb w0, p0, z0.s
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 4 x i32> %a, i32 %x
+ ret i32 %b
+}
+
+define i64 @test_lanex_2xi64(<vscale x 2 x i64> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_2xi64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: whilels p0.d, xzr, x8
+; CHECK-NEXT: lastb x0, p0, z0.d
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 2 x i64> %a, i32 %x
+ ret i64 %b
+}
+
+define double @test_lanex_2xf64(<vscale x 2 x double> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_2xf64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: whilels p0.d, xzr, x8
+; CHECK-NEXT: lastb d0, p0, z0.d
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 2 x double> %a, i32 %x
+ ret double %b
+}
+
+define float @test_lanex_4xf32(<vscale x 4 x float> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_4xf32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: whilels p0.s, xzr, x8
+; CHECK-NEXT: lastb s0, p0, z0.s
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 4 x float> %a, i32 %x
+ ret float %b
+}
+
+define half @test_lanex_8xf16(<vscale x 8 x half> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_8xf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: whilels p0.h, xzr, x8
+; CHECK-NEXT: lastb h0, p0, z0.h
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x half> %a, i32 %x
+ ret half %b
+}
+
+; Deliberately choose an index that is out-of-bounds for the minimum vector length
+define i8 @test_lane64_16xi8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_lane64_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #64
+; CHECK-NEXT: whilels p0.b, xzr, x8
+; CHECK-NEXT: lastb w0, p0, z0.b
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 64
+ ret i8 %b
+}
+
+define double @test_lane9_2xf64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: test_lane9_2xf64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #9
+; CHECK-NEXT: whilels p0.d, xzr, x8
+; CHECK-NEXT: lastb d0, p0, z0.d
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 2 x double> %a, i32 9
+ ret double %b
+}
+
+; Deliberately choose an index that is undefined
+define i32 @test_lane64_4xi32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: test_lane64_4xi32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.s, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 4 x i32> %a, i32 undef
+ ret i32 %b
+}
+
+define i8 @extract_of_insert_undef_16xi8(i8 %a) {
+; CHECK-LABEL: extract_of_insert_undef_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %b = insertelement <vscale x 16 x i8> undef, i8 %a, i32 0
+ %c = extractelement <vscale x 16 x i8> %b, i32 0
+ ret i8 %c
+}
+
+define i8 @extract0_of_insert0_16xi8(<vscale x 16 x i8> %a, i8 %b) {
+; CHECK-LABEL: extract0_of_insert0_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %c = insertelement <vscale x 16 x i8> %a, i8 %b, i32 0
+ %d = extractelement <vscale x 16 x i8> %c, i32 0
+ ret i8 %d
+}
+
+define i8 @extract64_of_insert64_16xi8(<vscale x 16 x i8> %a, i8 %b) {
+; CHECK-LABEL: extract64_of_insert64_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %c = insertelement <vscale x 16 x i8> %a, i8 %b, i32 64
+ %d = extractelement <vscale x 16 x i8> %c, i32 64
+ ret i8 %d
+}
+
+define i8 @extract_of_insert_diff_lanes_16xi8(<vscale x 16 x i8> %a, i8 %b) {
+; CHECK-LABEL: extract_of_insert_diff_lanes_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.b, z0.b[3]
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %c = insertelement <vscale x 16 x i8> %a, i8 %b, i32 0
+ %d = extractelement <vscale x 16 x i8> %c, i32 3
+ ret i8 %d
+}
+
+define i8 @test_lane0_zero_16xi8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_lane0_zero_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> zeroinitializer, i32 0
+ ret i8 %b
+}
+
+; The DAG combiner should fold the extract of a splat to give element zero
+; of the splat, i.e. %x. If the index is beyond the end of the scalable
+; vector the result is undefined anyway.
+define i64 @test_lanex_splat_2xi64(i64 %x, i32 %y) {
+; CHECK-LABEL: test_lanex_splat_2xi64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %a = insertelement <vscale x 2 x i64> undef, i64 %x, i32 0
+ %b = shufflevector <vscale x 2 x i64> %a, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+ %c = extractelement <vscale x 2 x i64> %b, i32 %y
+ ret i64 %c
+}
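
The splat test above exercises the SPLAT_VECTOR handling added to the DAG combiner and to getNode earlier in this patch. As a trivial standalone sketch (plain C++, not LLVM API): a splat carries a single scalar operand repeated in every lane, so an extract at any index can be folded to that one operand, whereas a build_vector supplies one operand per lane and the extract picks operand Idx.

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Operands holds the node's scalar operands: one element for a splat_vector,
// one per lane for a build_vector.
int foldExtract(bool IsSplat, const std::vector<int> &Operands, unsigned Idx) {
  if (IsSplat)
    return Operands[0]; // every lane of a splat is operand 0
  assert(Idx < Operands.size() && "constant index past the build_vector");
  return Operands[Idx]; // build_vector: lane Idx is operand Idx
}

int main() {
  printf("%d\n", foldExtract(/*IsSplat=*/true, {42}, 3));              // 42
  printf("%d\n", foldExtract(/*IsSplat=*/false, {10, 11, 12, 13}, 2)); // 12
  return 0;
}
```
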
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
define <vscale x 16 x i8> @test_lane0_16xi8(<vscale x 16 x i8> %a) {
-; CHECK-LABEL: test_lane0_16xi8
-; CHECK: mov [[REG:.*]], #30
-; CHECK: mov z0.b, p{{[0-7]}}/m, [[REG]]
+; CHECK-LABEL: test_lane0_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b, vl1
+; CHECK-NEXT: mov w8, #30
+; CHECK-NEXT: mov z0.b, p0/m, w8
+; CHECK-NEXT: ret
%b = insertelement <vscale x 16 x i8> %a, i8 30, i32 0
ret <vscale x 16 x i8> %b
}
define <vscale x 8 x i16> @test_lane0_8xi16(<vscale x 8 x i16> %a) {
-; CHECK-LABEL: test_lane0_8xi16
-; CHECK: mov [[REG:.*]], #30
-; CHECK: mov z0.h, p{{[0-7]}}/m, [[REG]]
+; CHECK-LABEL: test_lane0_8xi16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h, vl1
+; CHECK-NEXT: mov w8, #30
+; CHECK-NEXT: mov z0.h, p0/m, w8
+; CHECK-NEXT: ret
%b = insertelement <vscale x 8 x i16> %a, i16 30, i32 0
ret <vscale x 8 x i16> %b
}
define <vscale x 4 x i32> @test_lane0_4xi32(<vscale x 4 x i32> %a) {
-; CHECK-LABEL: test_lane0_4xi32
-; CHECK: mov [[REG:.*]], #30
-; CHECK: mov z0.s, p{{[0-7]}}/m, [[REG]]
+; CHECK-LABEL: test_lane0_4xi32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s, vl1
+; CHECK-NEXT: mov w8, #30
+; CHECK-NEXT: mov z0.s, p0/m, w8
+; CHECK-NEXT: ret
%b = insertelement <vscale x 4 x i32> %a, i32 30, i32 0
ret <vscale x 4 x i32> %b
}
define <vscale x 2 x i64> @test_lane0_2xi64(<vscale x 2 x i64> %a) {
-; CHECK-LABEL: test_lane0_2xi64
-; CHECK: mov w[[REG:.*]], #30
-; CHECK: mov z0.d, p{{[0-7]}}/m, x[[REG]]
+; CHECK-LABEL: test_lane0_2xi64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: mov w8, #30
+; CHECK-NEXT: mov z0.d, p0/m, x8
+; CHECK-NEXT: ret
%b = insertelement <vscale x 2 x i64> %a, i64 30, i32 0
ret <vscale x 2 x i64> %b
}
define <vscale x 2 x double> @test_lane0_2xf64(<vscale x 2 x double> %a) {
-; CHECK-LABEL: test_lane0_2xf64
-; CHECK: fmov d[[REG:[0-9]+]], #1.00000000
-; CHECK: mov z0.d, p{{[0-7]}}/m, z[[REG]].d
+; CHECK-LABEL: test_lane0_2xf64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d1, #1.00000000
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: mov z0.d, p0/m, z1.d
+; CHECK-NEXT: ret
%b = insertelement <vscale x 2 x double> %a, double 1.0, i32 0
ret <vscale x 2 x double> %b
}
define <vscale x 4 x float> @test_lane0_4xf32(<vscale x 4 x float> %a) {
-; CHECK-LABEL: test_lane0_4xf32
-; CHECK: fmov s[[REG:[0-9]+]], #1.00000000
-; CHECK: mov z0.s, p{{[0-7]}}/m, z[[REG]].s
+; CHECK-LABEL: test_lane0_4xf32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s1, #1.00000000
+; CHECK-NEXT: ptrue p0.s, vl1
+; CHECK-NEXT: mov z0.s, p0/m, z1.s
+; CHECK-NEXT: ret
%b = insertelement <vscale x 4 x float> %a, float 1.0, i32 0
ret <vscale x 4 x float> %b
}
define <vscale x 8 x half> @test_lane0_8xf16(<vscale x 8 x half> %a) {
-; CHECK-LABEL: test_lane0_8xf16
-; CHECK: fmov h[[REG:[0-9]+]], #1.00000000
-; CHECK: mov z0.h, p{{[0-7]}}/m, z[[REG]].h
+; CHECK-LABEL: test_lane0_8xf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov h1, #1.00000000
+; CHECK-NEXT: ptrue p0.h, vl1
+; CHECK-NEXT: mov z0.h, p0/m, z1.h
+; CHECK-NEXT: ret
%b = insertelement <vscale x 8 x half> %a, half 1.0, i32 0
ret <vscale x 8 x half> %b
}
; Undefined lane insert
define <vscale x 2 x i64> @test_lane4_2xi64(<vscale x 2 x i64> %a) {
-; CHECK-LABEL: test_lane4_2xi64
-; CHECK: mov w[[IDXREG:.*]], #4
-; CHECK: index z[[CMPVEC:[0-9]+]].d, #0, #1
-; CHECK: mov z[[IDXVEC:[0-9]+]].d, x[[IDXREG]]
-; CHECK: cmpeq p[[PRED:[0-9]+]].d, p{{[0-7]}}/z, z[[CMPVEC]].d, z[[IDXVEC]].d
-; CHECK: mov w[[VALREG:.*]], #30
-; CHECK: mov z0.d, p[[PRED]]/m, x[[VALREG]]
+; CHECK-LABEL: test_lane4_2xi64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #4
+; CHECK-NEXT: index z1.d, #0, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z2.d, x8
+; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d
+; CHECK-NEXT: mov w8, #30
+; CHECK-NEXT: mov z0.d, p0/m, x8
+; CHECK-NEXT: ret
%b = insertelement <vscale x 2 x i64> %a, i64 30, i32 4
ret <vscale x 2 x i64> %b
}
; Undefined lane insert
define <vscale x 8 x half> @test_lane9_8xf16(<vscale x 8 x half> %a) {
-; CHECK-LABEL: test_lane9_8xf16
-; CHECK: mov w[[IDXREG:.*]], #9
-; CHECK: index z[[CMPVEC:[0-9]+]].h, #0, #1
-; CHECK: mov z[[IDXVEC:[0-9]+]].h, w[[IDXREG]]
-; CHECK: cmpeq p[[PRED:[0-9]+]].h, p{{[0-7]}}/z, z[[CMPVEC]].h, z[[IDXVEC]].h
-; CHECK: fmov h[[VALREG:[0-9]+]], #1.00000000
-; CHECK: mov z0.h, p[[PRED]]/m, h[[VALREG]]
+; CHECK-LABEL: test_lane9_8xf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #9
+; CHECK-NEXT: index z1.h, #0, #1
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov z2.h, w8
+; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h
+; CHECK-NEXT: fmov h1, #1.00000000
+; CHECK-NEXT: mov z0.h, p0/m, h1
+; CHECK-NEXT: ret
%b = insertelement <vscale x 8 x half> %a, half 1.0, i32 9
ret <vscale x 8 x half> %b
}
define <vscale x 16 x i8> @test_lane1_16xi8(<vscale x 16 x i8> %a) {
-; CHECK-LABEL: test_lane1_16xi8
-; CHECK: mov w[[IDXREG:.*]], #1
-; CHECK: index z[[CMPVEC:[0-9]+]].b, #0, #1
-; CHECK: mov z[[IDXVEC:[0-9]+]].b, w[[IDXREG]]
-; CHECK: cmpeq p[[PRED:[0-9]+]].b, p{{[0-7]}}/z, z[[CMPVEC]].b, z[[IDXVEC]].b
-; CHECK: mov w[[VALREG:.*]], #30
-; CHECK: mov z0.b, p[[PRED]]/m, w[[VALREG]]
+; CHECK-LABEL: test_lane1_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: index z1.b, #0, #1
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: mov z2.b, w8
+; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b
+; CHECK-NEXT: mov w8, #30
+; CHECK-NEXT: mov z0.b, p0/m, w8
+; CHECK-NEXT: ret
%b = insertelement <vscale x 16 x i8> %a, i8 30, i32 1
ret <vscale x 16 x i8> %b
}
define <vscale x 16 x i8> @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) {
-; CHECK-LABEL: test_lanex_16xi8
-; CHECK: index z[[CMPVEC:[0-9]+]].b, #0, #1
-; CHECK: mov z[[IDXVEC:[0-9]+]].b, w[[IDXREG]]
-; CHECK: cmpeq p[[PRED:[0-9]+]].b, p{{[0-7]}}/z, z[[CMPVEC]].b, z[[IDXVEC]].b
-; CHECK: mov w[[VALREG:.*]], #30
-; CHECK: mov z0.b, p[[PRED]]/m, w[[VALREG]]
+; CHECK-LABEL: test_lanex_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: index z1.b, #0, #1
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: mov z2.b, w8
+; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b
+; CHECK-NEXT: mov w8, #30
+; CHECK-NEXT: mov z0.b, p0/m, w8
+; CHECK-NEXT: ret
%b = insertelement <vscale x 16 x i8> %a, i8 30, i32 %x
ret <vscale x 16 x i8> %b
}
; Redundant lane insert
define <vscale x 4 x i32> @extract_insert_4xi32(<vscale x 4 x i32> %a) {
-; CHECK-LABEL: extract_insert_4xi32
-; CHECK-NOT: mov w{{.*}}, #30
-; CHECK-NOT: mov z0.d
+; CHECK-LABEL: extract_insert_4xi32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
%b = extractelement <vscale x 4 x i32> %a, i32 2
%c = insertelement <vscale x 4 x i32> %a, i32 %b, i32 2
ret <vscale x 4 x i32> %c
}
define <vscale x 8 x i16> @test_lane6_undef_8xi16(i16 %a) {
-; CHECK-LABEL: test_lane6_undef_8xi16
-; CHECK: mov w[[IDXREG:.*]], #6
-; CHECK: index z[[CMPVEC:.*]].h, #0, #1
-; CHECK: mov z[[IDXVEC:[0-9]+]].h, w[[IDXREG]]
-; CHECK: cmpeq p[[PRED:.*]].h, p{{.*}}/z, z[[CMPVEC]].h, z[[IDXVEC]].h
-; CHECK: mov z0.h, p[[PRED]]/m, w0
+; CHECK-LABEL: test_lane6_undef_8xi16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #6
+; CHECK-NEXT: index z0.h, #0, #1
+; CHECK-NEXT: mov z1.h, w8
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT: mov z0.h, p0/m, w0
+; CHECK-NEXT: ret
%b = insertelement <vscale x 8 x i16> undef, i16 %a, i32 6
ret <vscale x 8 x i16> %b
}
define <vscale x 16 x i8> @test_lane0_undef_16xi8(i8 %a) {
-; CHECK-LABEL: test_lane0_undef_16xi8
-; CHECK: fmov s0, w0
+; CHECK-LABEL: test_lane0_undef_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ret
%b = insertelement <vscale x 16 x i8> undef, i8 %a, i32 0
ret <vscale x 16 x i8> %b
}
+
+define <vscale x 16 x i8> @test_insert0_of_extract0_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_insert0_of_extract0_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.b, b1
+; CHECK-NEXT: ptrue p0.b, vl1
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: mov z0.b, p0/m, w8
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 16 x i8> %b, i32 0
+ %d = insertelement <vscale x 16 x i8> %a, i8 %c, i32 0
+ ret <vscale x 16 x i8> %d
+}
+
+define <vscale x 16 x i8> @test_insert64_of_extract64_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_insert64_of_extract64_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #64
+; CHECK-NEXT: index z2.b, #0, #1
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: whilels p1.b, xzr, x8
+; CHECK-NEXT: mov z3.b, w8
+; CHECK-NEXT: lastb w8, p1, z1.b
+; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b
+; CHECK-NEXT: mov z0.b, p0/m, w8
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 16 x i8> %b, i32 64
+ %d = insertelement <vscale x 16 x i8> %a, i8 %c, i32 64
+ ret <vscale x 16 x i8> %d
+}
+
+define <vscale x 16 x i8> @test_insert3_of_extract1_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_insert3_of_extract1_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.b, z1.b[1]
+; CHECK-NEXT: mov w8, #3
+; CHECK-NEXT: index z2.b, #0, #1
+; CHECK-NEXT: fmov w9, s1
+; CHECK-NEXT: mov z1.b, w8
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z1.b
+; CHECK-NEXT: mov z0.b, p0/m, w9
+; CHECK-NEXT: ret
+ %c = extractelement <vscale x 16 x i8> %b, i32 1
+ %d = insertelement <vscale x 16 x i8> %a, i8 %c, i32 3
+ ret <vscale x 16 x i8> %d
+}