if (SrcEltTy == SmallestEltTy)
continue;
assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
- Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
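+ // Use VECTOR_REG_CAST rather than BITCAST here: only the in-register
+ // interpretation needs to change, and unlike a bitconvert this is free
+ // in big-endian mode (no VREV is implied).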
+ Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
Src.WindowBase *= Src.WindowScale;
}
ShuffleOps[1], Mask, DAG);
if (!Shuffle)
return SDValue();
- return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+ return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
}
return SDValue();
}
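+// Fold away redundant VECTOR_REG_CAST nodes, and lower the cast to a plain
+// BITCAST on little-endian targets, where the register and memory layouts
+// agree.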
+static SDValue
+PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *ST) {
+ EVT VT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ SDLoc dl(N);
+
+ // Under little endian, a VECTOR_REG_CAST is equivalent to a BITCAST,
+ // since the in-register and in-memory layouts agree.
+ if (ST->isLittle())
+ return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op);
+
+ // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
+ if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
+ // If the value types are the same, we can remove the cast entirely.
+ if (Op->getOperand(0).getValueType() == VT)
+ return Op->getOperand(0);
+ return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
+ }
+
+ return SDValue();
+}
+
static SDValue PerformVCMPCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
return PerformARMBUILD_VECTORCombine(N, DCI);
case ARMISD::PREDICATE_CAST:
return PerformPREDICATE_CASTCombine(N, DCI);
+ case ARMISD::VECTOR_REG_CAST:
+ return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget);
case ARMISD::VCMP:
return PerformVCMPCombine(N, DCI, Subtarget);
case ARMISD::SMULWB: {
def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>;
def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>;
+// 'VECTOR_REG_CAST' is an operation that reinterprets the contents of a
+// vector register as a different vector type, without changing the contents of
+// the register. It differs from 'bitconvert' in that bitconvert reinterprets
+// the _memory_ storage format of the vector, whereas VECTOR_REG_CAST
+// reinterprets the _register_ format - and in big-endian, the memory and
+// register formats are different, so they are different operations.
+//
+// For example, 'VECTOR_REG_CAST' between v8i16 and v16i8 will map the LSB of
+// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
+// whereas 'bitconvert' will map it to the high byte in big-endian mode,
+// because that's what (MVE) VSTRH.16 followed by VLDRB.8 would do. So the
+// bitconvert would have to emit a VREV16.8 instruction, whereas the
+// VECTOR_REG_CAST emits no code at all if the vector is already in a register.
+def ARMVectorRegCast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
+
//===----------------------------------------------------------------------===//
// ARM Flag Definitions.
// vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;
-// 'vector_reg_cast' is an operation that reinterprets the contents of an MVE
-// vector register as a different vector type, without changing the contents of
-// the register. It differs from 'bitconvert' in that bitconvert reinterprets
-// the _memory_ storage format of the vector, whereas vector_reg_cast
-// reinterprets the _register_ format - and in big-endian, the memory and
-// register formats are different, so they are different operations.
-//
-// For example, 'vector_reg_cast' between v8i16 and v16i8 will map the LSB of
-// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
-// whereas 'bitconvert' will map it to the high byte in big-endian mode,
-// because that's what VSTRH.16 followed by VLDRB.8 would do. So the bitconvert
-// would have to emit a VREV16.8 instruction, whereas the vector_reg_cast emits
-// no code at all if the vector is already in a register.
-def vector_reg_cast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
-
let Predicates = [HasMVEInt] in {
foreach VT = [ v4i1, v8i1, v16i1 ] in {
def : Pat<(i32 (predicate_cast (VT VCCR:$src))),
foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
- def : Pat<(VT (vector_reg_cast (VT2 MQPR:$src))), (VT MQPR:$src)>;
+ def : Pat<(VT (ARMVectorRegCast (VT2 MQPR:$src))), (VT MQPR:$src)>;
}
// end of MVE compares
def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>;
}
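+// ARMVectorRegCast is a no-op in a register, so match it to the unchanged
+// source register for any pair of equal-width NEON vector types, covering
+// both 128-bit Q registers and 64-bit D registers.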
+let Predicates = [HasNEON] in {
+ foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+ foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+ def : Pat<(VT (ARMVectorRegCast (VT2 QPR:$src))), (VT QPR:$src)>;
+
+ foreach VT = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
+ foreach VT2 = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
+ def : Pat<(VT (ARMVectorRegCast (VT2 DPR:$src))), (VT DPR:$src)>;
+}
+
// Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian
let Predicates = [IsBE,HasNEON] in {
def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
;
; CHECKBE-LABEL: test:
; CHECKBE: @ %bb.0: @ %entry
-; CHECKBE-NEXT: vrev64.8 q8, q1
-; CHECKBE-NEXT: vrev64.8 q9, q0
+; CHECKBE-NEXT: vrev64.16 q8, q1
+; CHECKBE-NEXT: vrev64.16 q9, q0
; CHECKBE-NEXT: vtrn.8 q9, q8
; CHECKBE-NEXT: vrev64.8 q0, q9
; CHECKBE-NEXT: bx lr
;
; CHECKBE-LABEL: vmovn32_trunc1:
; CHECKBE: @ %bb.0: @ %entry
-; CHECKBE-NEXT: vrev64.16 q2, q1
-; CHECKBE-NEXT: vrev64.16 q1, q0
+; CHECKBE-NEXT: vrev64.32 q2, q1
+; CHECKBE-NEXT: vrev64.32 q1, q0
; CHECKBE-NEXT: vmovnt.i32 q1, q2
; CHECKBE-NEXT: vrev64.16 q0, q1
; CHECKBE-NEXT: bx lr
;
; CHECKBE-LABEL: vmovn32_trunc2:
; CHECKBE: @ %bb.0: @ %entry
-; CHECKBE-NEXT: vrev64.16 q2, q0
-; CHECKBE-NEXT: vrev64.16 q3, q1
+; CHECKBE-NEXT: vrev64.32 q2, q0
+; CHECKBE-NEXT: vrev64.32 q3, q1
; CHECKBE-NEXT: vmovnt.i32 q3, q2
; CHECKBE-NEXT: vrev64.16 q0, q3
; CHECKBE-NEXT: bx lr
;
; CHECKBE-LABEL: vmovn16_trunc1:
; CHECKBE: @ %bb.0: @ %entry
-; CHECKBE-NEXT: vrev64.8 q2, q1
-; CHECKBE-NEXT: vrev64.8 q1, q0
+; CHECKBE-NEXT: vrev64.16 q2, q1
+; CHECKBE-NEXT: vrev64.16 q1, q0
; CHECKBE-NEXT: vmovnt.i16 q1, q2
; CHECKBE-NEXT: vrev64.8 q0, q1
; CHECKBE-NEXT: bx lr
;
; CHECKBE-LABEL: vmovn16_trunc2:
; CHECKBE: @ %bb.0: @ %entry
-; CHECKBE-NEXT: vrev64.8 q2, q0
-; CHECKBE-NEXT: vrev64.8 q3, q1
+; CHECKBE-NEXT: vrev64.16 q2, q0
+; CHECKBE-NEXT: vrev64.16 q3, q1
; CHECKBE-NEXT: vmovnt.i16 q3, q2
; CHECKBE-NEXT: vrev64.8 q0, q3
; CHECKBE-NEXT: bx lr