From 9d4c59754110647f8cc8cdd4fef3114843c91d17 Mon Sep 17 00:00:00 2001
From: David Green
Date: Wed, 12 Feb 2020 20:51:39 +0000
Subject: [PATCH] [ARM] Fix ReconstructShuffle for bigendian

Simon pointed out that this function performs a bitcast, which can be
incorrect for big endian. That makes the lowering of VMOVN in MVE wrong,
and because the function is shared between Neon and MVE, both can be
incorrect.

This fixes the problem by using the newly added VECTOR_REG_CAST instead
of the BITCAST. As the node may now be used on Neon, I've added the
relevant patterns for it there too. I've also added a quick dag combine
for it to remove redundant casts where possible.

Differential Revision: https://reviews.llvm.org/D74485
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 28 ++++++++++++++++++++++++++--
 llvm/lib/Target/ARM/ARMInstrInfo.td     | 15 +++++++++++++++
 llvm/lib/Target/ARM/ARMInstrMVE.td      | 17 +----------------
 llvm/lib/Target/ARM/ARMInstrNEON.td     | 10 ++++++++++
 llvm/test/CodeGen/ARM/neon-vmovn.ll     |  4 ++--
 llvm/test/CodeGen/Thumb2/mve-vmovn.ll   | 16 ++++++++--------
 6 files changed, 62 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index f168eb1..e42f4de 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -7526,7 +7526,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
     if (SrcEltTy == SmallestEltTy)
       continue;
     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
-    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+    Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
     Src.WindowBase *= Src.WindowScale;
   }
@@ -7578,7 +7578,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                             ShuffleOps[1], Mask, DAG);
   if (!Shuffle)
     return SDValue();
-  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+  return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
 }
 
 enum ShuffleOpCodes {
@@ -12952,6 +12952,28 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   return SDValue();
 }
 
+static SDValue
+PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                              const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  SDLoc dl(N);
+
+  // Under little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
+  if (ST->isLittle())
+    return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op);
+
+  // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
+  if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
+    // If the value types are the same, we can remove the cast entirely.
+    if (Op->getOperand(0).getValueType() == VT)
+      return Op->getOperand(0);
+    return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
+  }
+
+  return SDValue();
+}
+
 static SDValue
 PerformVCMPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                    const ARMSubtarget *Subtarget) {
@@ -14787,6 +14809,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
     return PerformARMBUILD_VECTORCombine(N, DCI);
   case ARMISD::PREDICATE_CAST:
     return PerformPREDICATE_CASTCombine(N, DCI);
+  case ARMISD::VECTOR_REG_CAST:
+    return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget);
   case ARMISD::VCMP:
     return PerformVCMPCombine(N, DCI, Subtarget);
   case ARMISD::SMULWB: {
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 672dfca..2674cc1 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -294,6 +294,21 @@ def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>;
 def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>;
 def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>;
 
+// 'VECTOR_REG_CAST' is an operation that reinterprets the contents of a
+// vector register as a different vector type, without changing the contents of
+// the register. It differs from 'bitconvert' in that bitconvert reinterprets
+// the _memory_ storage format of the vector, whereas VECTOR_REG_CAST
+// reinterprets the _register_ format - and in big-endian, the memory and
+// register formats are different, so they are different operations.
+//
+// For example, 'VECTOR_REG_CAST' between v8i16 and v16i8 will map the LSB of
+// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
+// whereas 'bitconvert' will map it to the high byte in big-endian mode,
+// because that's what (MVE) VSTRH.16 followed by VLDRB.8 would do. So the
+// bitconvert would have to emit a VREV16.8 instruction, whereas the
+// VECTOR_REG_CAST emits no code at all if the vector is already in a register.
+def ARMVectorRegCast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
+
 //===----------------------------------------------------------------------===//
 // ARM Flag Definitions.
 
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 2b95bce..8a03c50 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -3990,21 +3990,6 @@ let Predicates = [HasMVEInt] in {
 // vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
 def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;
 
-// 'vector_reg_cast' is an operation that reinterprets the contents of an MVE
-// vector register as a different vector type, without changing the contents of
-// the register. It differs from 'bitconvert' in that bitconvert reinterprets
-// the _memory_ storage format of the vector, whereas vector_reg_cast
-// reinterprets the _register_ format - and in big-endian, the memory and
-// register formats are different, so they are different operations.
-//
-// For example, 'vector_reg_cast' between v8i16 and v16i8 will map the LSB of
-// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
-// whereas 'bitconvert' will map it to the high byte in big-endian mode,
-// because that's what VSTRH.16 followed by VLDRB.8 would do. So the bitconvert
-// would have to emit a VREV16.8 instruction, whereas the vector_reg_cast emits
-// no code at all if the vector is already in a register.
-def vector_reg_cast : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
-
 let Predicates = [HasMVEInt] in {
   foreach VT = [ v4i1, v8i1, v16i1 ] in {
     def : Pat<(i32 (predicate_cast (VT VCCR:$src))),
@@ -4019,7 +4004,7 @@ let Predicates = [HasMVEInt] in {
 
   foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
   foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
-  def : Pat<(VT (vector_reg_cast (VT2 MQPR:$src))), (VT MQPR:$src)>;
+  def : Pat<(VT (ARMVectorRegCast (VT2 MQPR:$src))), (VT MQPR:$src)>;
 
 } // end of MVE compares
 
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index a20acd8..d0801b8 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -7531,6 +7531,16 @@ let Predicates = [IsBE,HasNEON] in {
   def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>;
 }
 
+let Predicates = [HasNEON] in {
+  foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+  foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+    def : Pat<(VT (ARMVectorRegCast (VT2 QPR:$src))), (VT QPR:$src)>;
+
+  foreach VT = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
+  foreach VT2 = [ v8i8, v4i16, v4f16, v2i32, v2f32, v1i64, f64 ] in
+    def : Pat<(VT (ARMVectorRegCast (VT2 DPR:$src))), (VT DPR:$src)>;
+}
+
 // Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian
 let Predicates = [IsBE,HasNEON] in {
   def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
diff --git a/llvm/test/CodeGen/ARM/neon-vmovn.ll b/llvm/test/CodeGen/ARM/neon-vmovn.ll
index 675a38a..c16eace 100644
--- a/llvm/test/CodeGen/ARM/neon-vmovn.ll
+++ b/llvm/test/CodeGen/ARM/neon-vmovn.ll
@@ -732,8 +732,8 @@ define arm_aapcs_vfpcc <16 x i8> @test(<8 x i16> %src1, <8 x i16> %src2) {
 ;
 ; CHECKBE-LABEL: test:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.8 q8, q1
-; CHECKBE-NEXT:    vrev64.8 q9, q0
+; CHECKBE-NEXT:    vrev64.16 q8, q1
+; CHECKBE-NEXT:    vrev64.16 q9, q0
 ; CHECKBE-NEXT:    vtrn.8 q9, q8
 ; CHECKBE-NEXT:    vrev64.8 q0, q9
 ; CHECKBE-NEXT:    bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
index ec34bdd..0a48179 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
@@ -10,8 +10,8 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc1(<4 x i32> %src1, <4 x i32> %src
 ;
 ; CHECKBE-LABEL: vmovn32_trunc1:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.16 q2, q1
-; CHECKBE-NEXT:    vrev64.16 q1, q0
+; CHECKBE-NEXT:    vrev64.32 q2, q1
+; CHECKBE-NEXT:    vrev64.32 q1, q0
 ; CHECKBE-NEXT:    vmovnt.i32 q1, q2
 ; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    bx lr
@@ -30,8 +30,8 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc2(<4 x i32> %src1, <4 x i32> %src
 ;
 ; CHECKBE-LABEL: vmovn32_trunc2:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.16 q2, q0
-; CHECKBE-NEXT:    vrev64.16 q3, q1
+; CHECKBE-NEXT:    vrev64.32 q2, q0
+; CHECKBE-NEXT:    vrev64.32 q3, q1
 ; CHECKBE-NEXT:    vmovnt.i32 q3, q2
 ; CHECKBE-NEXT:    vrev64.16 q0, q3
 ; CHECKBE-NEXT:    bx lr
@@ -49,8 +49,8 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc1(<8 x i16> %src1, <8 x i16> %src
 ;
 ; CHECKBE-LABEL: vmovn16_trunc1:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.8 q2, q1
-; CHECKBE-NEXT:    vrev64.8 q1, q0
+; CHECKBE-NEXT:    vrev64.16 q2, q1
+; CHECKBE-NEXT:    vrev64.16 q1, q0
 ; CHECKBE-NEXT:    vmovnt.i16 q1, q2
 ; CHECKBE-NEXT:    vrev64.8 q0, q1
 ; CHECKBE-NEXT:    bx lr
@@ -69,8 +69,8 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc2(<8 x i16> %src1, <8 x i16> %src
 ;
 ; CHECKBE-LABEL: vmovn16_trunc2:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.8 q2, q0
-; CHECKBE-NEXT:    vrev64.8 q3, q1
+; CHECKBE-NEXT:    vrev64.16 q2, q0
+; CHECKBE-NEXT:    vrev64.16 q3, q1
 ; CHECKBE-NEXT:    vmovnt.i16 q3, q2
 ; CHECKBE-NEXT:    vrev64.8 q0, q3
 ; CHECKBE-NEXT:    bx lr
-- 
2.7.4
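
To make the register-format vs memory-format distinction concrete, here is a
small standalone C++ sketch (illustrative only, not part of the patch; the
values and variable names are made up). It simulates what the ARMInstrInfo.td
comment describes for 'bitconvert' on big endian: storing a v8i16 the way
VSTRH.16 would and reloading the bytes the way VLDRB.8 would, which moves the
LSB of i16 lane 0 into i8 lane 1. That lane movement is exactly what a plain
BITCAST models, and why ReconstructShuffle's lane arithmetic needs
VECTOR_REG_CAST instead.

    // Simulates big-endian 'bitconvert' of v8i16 -> v16i8 via memory.
    #include <cstdint>
    #include <cstdio>

    int main() {
      // A v8i16 whose zeroth lane holds 0x0102 (its LSB is 0x02).
      uint16_t lanes[8] = {0x0102, 0, 0, 0, 0, 0, 0, 0};

      // Big-endian VSTRH.16: each i16 lane is written most-significant
      // byte first, so lane 0 becomes the bytes {0x01, 0x02} in memory.
      uint8_t mem[16] = {};
      for (int i = 0; i < 8; ++i) {
        mem[2 * i]     = uint8_t(lanes[i] >> 8);
        mem[2 * i + 1] = uint8_t(lanes[i] & 0xff);
      }

      // VLDRB.8 then reads the 16 bytes back as consecutive i8 lanes, so
      // the LSB of i16 lane 0 ends up in i8 lane 1, not lane 0.
      printf("i8 lane 0 = 0x%02x, i8 lane 1 = 0x%02x\n", mem[0], mem[1]);
      // Prints: i8 lane 0 = 0x01, i8 lane 1 = 0x02

      // VECTOR_REG_CAST, by contrast, leaves the register bits untouched,
      // so the LSB of i16 lane 0 stays in i8 lane 0 on either endianness.
      return 0;
    }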