[ARM] Fix ReconstructShuffle for bigendian

author David Green <david.green@arm.com>

Wed, 12 Feb 2020 20:51:39 +0000 (20:51 +0000)

committer David Green <david.green@arm.com>

Thu, 13 Feb 2020 09:56:46 +0000 (09:56 +0000)
author David Green <david.green@arm.com>
Wed, 12 Feb 2020 20:51:39 +0000 (20:51 +0000)
committer David Green <david.green@arm.com>
Thu, 13 Feb 2020 09:56:46 +0000 (09:56 +0000)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp

index f168eb1..e42f4de 100644 (file)
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -7526,7 +7526,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
      if (SrcEltTy == SmallestEltTy)
        continue;
      assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
-    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+    Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
      Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
      Src.WindowBase *= Src.WindowScale;
    }
@@ -7578,7 +7578,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                              ShuffleOps[1], Mask, DAG);
    if (!Shuffle)
      return SDValue();
-  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+  return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
  }
  
  enum ShuffleOpCodes {
@@ -12952,6 +12952,28 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
    return SDValue();
  }
  
+static SDValue
+PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                              const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  SDLoc dl(N);
+
+  // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
+  if (ST->isLittle())
+    return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op);
+
+  // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
+  if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
+    // If the valuetypes are the same, we can remove the cast entirely.
+    if (Op->getOperand(0).getValueType() == VT)
+      return Op->getOperand(0);
+    return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
+  }
+
+  return SDValue();
+}
+
  static SDValue PerformVCMPCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const ARMSubtarget *Subtarget) {
@@ -14787,6 +14809,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
      return PerformARMBUILD_VECTORCombine(N, DCI);
    case ARMISD::PREDICATE_CAST:
      return PerformPREDICATE_CASTCombine(N, DCI);
+  case ARMISD::VECTOR_REG_CAST:
+    return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget);
    case ARMISD::VCMP:
      return PerformVCMPCombine(N, DCI, Subtarget);
    case ARMISD::SMULWB: {
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td

index 672dfca..2674cc1 100644 (file)
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -294,6 +294,21 @@ def ARMWLS      : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>;
  def ARMLE       : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>;
  def ARMLoopDec  : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>;
  
+// 'VECTOR_REG_CAST' is an operation that reinterprets the contents of a
+// vector register as a different vector type, without changing the contents of
+// the register. It differs from 'bitconvert' in that bitconvert reinterprets
+// the _memory_ storage format of the vector, whereas VECTOR_REG_CAST
+// reinterprets the _register_ format - and in big-endian, the memory and
+// register formats are different, so they are different operations.
+//
+// For example, 'VECTOR_REG_CAST' between v8i16 and v16i8 will map the LSB of
+// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
+// whereas 'bitconvert' will map it to the high byte in big-endian mode,
+// because that's what (MVE) VSTRH.16 followed by VLDRB.8 would do. So the
+// bitconvert would have to emit a VREV16.8 instruction, whereas the
+// VECTOR_REG_CAST emits no code at all if the vector is already in a register.
+def ARMVectorRegCast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
+
  //===----------------------------------------------------------------------===//
  // ARM Flag Definitions.
  
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td

index 2b95bce..8a03c50 100644 (file)
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -3990,21 +3990,6 @@ let Predicates = [HasMVEInt] in {
  // vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
  def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;
  
-// 'vector_reg_cast' is an operation that reinterprets the contents of an MVE
-// vector register as a different vector type, without changing the contents of
-// the register. It differs from 'bitconvert' in that bitconvert reinterprets
-// the _memory_ storage format of the vector, whereas vector_reg_cast
-// reinterprets the _register_ format - and in big-endian, the memory and
-// register formats are different, so they are different operations.
-//
-// For example, 'vector_reg_cast' between v8i16 and v16i8 will map the LSB of
-// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
-// whereas 'bitconvert' will map it to the high byte in big-endian mode,
-// because that's what VSTRH.16 followed by VLDRB.8 would do. So the bitconvert
-// would have to emit a VREV16.8 instruction, whereas the vector_reg_cast emits
-// no code at all if the vector is already in a register.
-def vector_reg_cast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
-
  let Predicates = [HasMVEInt] in {
    foreach VT = [ v4i1, v8i1, v16i1 ] in {
      def : Pat<(i32 (predicate_cast (VT VCCR:$src))),
@@ -4019,7 +4004,7 @@ let Predicates = [HasMVEInt] in {
  
    foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
      foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
-      def : Pat<(VT (vector_reg_cast (VT2 MQPR:$src))), (VT MQPR:$src)>;
+      def : Pat<(VT (ARMVectorRegCast (VT2 MQPR:$src))), (VT MQPR:$src)>;
  }
  
  // end of MVE compares
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td

index a20acd8..d0801b8 100644 (file)
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -7531,6 +7531,16 @@ let Predicates = [IsBE,HasNEON] in {
    def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8  QPR:$src)>;
  }
  
+let Predicates = [HasNEON] in {
+  foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+    foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+      def : Pat<(VT (ARMVectorRegCast (VT2 QPR:$src))), (VT QPR:$src)>;
+
+  foreach VT = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
+    foreach VT2 = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
+      def : Pat<(VT (ARMVectorRegCast (VT2 DPR:$src))), (VT DPR:$src)>;
+}
+
  // Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian
  let Predicates = [IsBE,HasNEON] in {
  def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
diff --git a/llvm/test/CodeGen/ARM/neon-vmovn.ll b/llvm/test/CodeGen/ARM/neon-vmovn.ll

index 675a38a..c16eace 100644 (file)
--- a/llvm/test/CodeGen/ARM/neon-vmovn.ll
+++ b/llvm/test/CodeGen/ARM/neon-vmovn.ll
@@ -732,8 +732,8 @@ define arm_aapcs_vfpcc <16 x i8> @test(<8 x i16> %src1, <8 x i16> %src2) {
  ;
  ; CHECKBE-LABEL: test:
  ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.8 q8, q1
-; CHECKBE-NEXT:    vrev64.8 q9, q0
+; CHECKBE-NEXT:    vrev64.16 q8, q1
+; CHECKBE-NEXT:    vrev64.16 q9, q0
  ; CHECKBE-NEXT:    vtrn.8 q9, q8
  ; CHECKBE-NEXT:    vrev64.8 q0, q9
  ; CHECKBE-NEXT:    bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll

index ec34bdd..0a48179 100644 (file)
--- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
@@ -10,8 +10,8 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc1(<4 x i32> %src1, <4 x i32> %src
  ;
  ; CHECKBE-LABEL: vmovn32_trunc1:
  ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.16 q2, q1
-; CHECKBE-NEXT:    vrev64.16 q1, q0
+; CHECKBE-NEXT:    vrev64.32 q2, q1
+; CHECKBE-NEXT:    vrev64.32 q1, q0
  ; CHECKBE-NEXT:    vmovnt.i32 q1, q2
  ; CHECKBE-NEXT:    vrev64.16 q0, q1
  ; CHECKBE-NEXT:    bx lr
@@ -30,8 +30,8 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc2(<4 x i32> %src1, <4 x i32> %src
  ;
  ; CHECKBE-LABEL: vmovn32_trunc2:
  ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.16 q2, q0
-; CHECKBE-NEXT:    vrev64.16 q3, q1
+; CHECKBE-NEXT:    vrev64.32 q2, q0
+; CHECKBE-NEXT:    vrev64.32 q3, q1
  ; CHECKBE-NEXT:    vmovnt.i32 q3, q2
  ; CHECKBE-NEXT:    vrev64.16 q0, q3
  ; CHECKBE-NEXT:    bx lr
@@ -49,8 +49,8 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc1(<8 x i16> %src1, <8 x i16> %src
  ;
  ; CHECKBE-LABEL: vmovn16_trunc1:
  ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.8 q2, q1
-; CHECKBE-NEXT:    vrev64.8 q1, q0
+; CHECKBE-NEXT:    vrev64.16 q2, q1
+; CHECKBE-NEXT:    vrev64.16 q1, q0
  ; CHECKBE-NEXT:    vmovnt.i16 q1, q2
  ; CHECKBE-NEXT:    vrev64.8 q0, q1
  ; CHECKBE-NEXT:    bx lr
@@ -69,8 +69,8 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc2(<8 x i16> %src1, <8 x i16> %src
  ;
  ; CHECKBE-LABEL: vmovn16_trunc2:
  ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.8 q2, q0
-; CHECKBE-NEXT:    vrev64.8 q3, q1
+; CHECKBE-NEXT:    vrev64.16 q2, q0
+; CHECKBE-NEXT:    vrev64.16 q3, q1
  ; CHECKBE-NEXT:    vmovnt.i16 q3, q2
  ; CHECKBE-NEXT:    vrev64.8 q0, q3
  ; CHECKBE-NEXT:    bx lr
author	David Green <david.green@arm.com>
	Wed, 12 Feb 2020 20:51:39 +0000 (20:51 +0000)
committer	David Green <david.green@arm.com>
	Thu, 13 Feb 2020 09:56:46 +0000 (09:56 +0000)
llvm/lib/Target/ARM/ARMISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/ARM/ARMInstrInfo.td		patch \| blob \| history
llvm/lib/Target/ARM/ARMInstrMVE.td		patch \| blob \| history
llvm/lib/Target/ARM/ARMInstrNEON.td		patch \| blob \| history
llvm/test/CodeGen/ARM/neon-vmovn.ll		patch \| blob \| history
llvm/test/CodeGen/Thumb2/mve-vmovn.ll		patch \| blob \| history