From 02cd8a6b915a9dab32fdd91167f875ce5f67ebd4 Mon Sep 17 00:00:00 2001
From: David Green
Date: Wed, 22 Sep 2021 12:07:52 +0100
Subject: [PATCH] [ARM] Allow smaller VMOVL in tail predicated loops

This allows VMOVL in tail predicated loops so long as the vector size the
VMOVL is extending into is less than or equal to the size of the VCTP in the
tail predicated loop. These cases represent a sign-extend-inreg (or
zero-extend-inreg), which needn't block tail predication as in
https://godbolt.org/z/hdTsEbx8Y.

For this, a VecSize field has been added to the TSFlags bits of MVE
instructions, which stores the size of the elements that the MVE instruction
operates on. In the case of multiple sizes (such as an MVE_VMOVLs8bh, which
extends from i8 to i16), the largest size is chosen. The sizes are encoded as
00 = i8, 01 = i16, 10 = i32 and 11 = i64; these values often (but not always)
come directly from the instruction encoding.

A unit test was added, and although only a subset of the vecsizes are
currently used, the rest should be useful for other cases.

Differential Revision: https://reviews.llvm.org/D109706
---
 llvm/lib/Target/ARM/ARMInstrFormats.td | 2 +
 llvm/lib/Target/ARM/ARMInstrMVE.td | 576 ++++++++--------
 llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 32 +-
 llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h | 8 +
 llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll | 65 +-
 llvm/unittests/Target/ARM/MachineInstrTest.cpp | 883 +++++++++++++++++++++++++
 6 files changed, 1234 insertions(+), 332 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td
index b00f974..de35137 100644
--- a/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -408,6 +408,7 @@ class InstTemplate VecSize = 0; bit validForTailPredication = 0; bit retainsPreviousHalfElement = 0; bit horizontalReduction = 0;
@@ -428,6 +429,7 @@ class InstTemplate pattern> + string ops, string cstr, bits<2> vecsize, list pattern> : Thumb2XI, Requires<[HasMVEInt]> { let D = MVEDomain; let DecoderNamespace = "MVE"; + let VecSize = vecsize; } // MVE_p is used for most predicated instructions, to add the cluster
@@ -406,22 +407,22 @@ class MVE_MI pattern=[]> + bits<2> vecsize, list pattern=[]> : MVE_MI { + ops, !strconcat(cstr, vpred.vpred_constraint), vecsize, pattern> { let Inst{31-29} = 0b111; let Inst{27-26} = 0b11; } class MVE_f pattern=[]> - : MVE_p { + bits<2> vecsize, list pattern=[]> + : MVE_p { let Predicates = [HasMVEFloat]; }
@@ -599,11 +600,11 @@ def MVE_URSHRL : MVE_ScalarShiftDRegImm<"urshrl", 0b01, 0b1>; class MVE_rDest pattern=[]> + string ops, string cstr, bits<2> vecsize, list pattern=[]> // Always use vpred_n and not vpred_r: with the output register being // a GPR and not a vector register, there can't be any question of // what to put in its inactive lanes.
- : MVE_p { + : MVE_p { let Inst{25-23} = 0b101; let Inst{11-9} = 0b111; @@ -613,7 +614,7 @@ class MVE_rDest size> : MVE_rDest<(outs rGPR:$Rda), (ins rGPR:$Rda_src, MQPR:$Qn, MQPR:$Qm), NoItinerary, "vabav", suffix, "$Rda, $Qn, $Qm", "$Rda = $Rda_src", - []> { + size, []> { bits<4> Qm; bits<4> Qn; bits<4> Rda; @@ -666,7 +667,7 @@ defm MVE_VABAVu32 : MVE_VABAV_m; class MVE_VADDV size, list pattern=[]> : MVE_rDest<(outs tGPREven:$Rda), iops, NoItinerary, - iname, suffix, "$Rda, $Qm", cstr, pattern> { + iname, suffix, "$Rda, $Qm", cstr, size, pattern> { bits<3> Qm; bits<4> Rda; @@ -764,7 +765,7 @@ defm MVE_VADDVu32 : MVE_VADDV_A; class MVE_VADDLV pattern=[]> : MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname, - suffix, "$RdaLo, $RdaHi, $Qm", cstr, pattern> { + suffix, "$RdaLo, $RdaHi, $Qm", cstr, 0b10, pattern> { bits<3> Qm; bits<4> RdaLo; bits<4> RdaHi; @@ -836,7 +837,7 @@ class MVE_VMINMAXNMV pattern=[]> : MVE_rDest<(outs rGPR:$RdaDest), (ins rGPR:$RdaSrc, MQPR:$Qm), NoItinerary, iname, suffix, "$RdaSrc, $Qm", - "$RdaDest = $RdaSrc", pattern> { + "$RdaDest = $RdaSrc", !if(sz, 0b01, 0b10), pattern> { bits<3> Qm; bits<4> RdaDest; @@ -897,7 +898,7 @@ defm MVE_VMAXNMAV: MVE_VMINMAXNMV_fty<"vmaxnmav", 0, 0, "int_arm_mve_maxnmav">; class MVE_VMINMAXV size, bit bit_17, bit bit_7, list pattern=[]> : MVE_rDest<(outs rGPR:$RdaDest), (ins rGPR:$RdaSrc, MQPR:$Qm), NoItinerary, - iname, suffix, "$RdaSrc, $Qm", "$RdaDest = $RdaSrc", pattern> { + iname, suffix, "$RdaSrc, $Qm", "$RdaDest = $RdaSrc", size, pattern> { bits<3> Qm; bits<4> RdaDest; @@ -1020,9 +1021,10 @@ defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 1, "int_arm_mve_minav">; defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0, "int_arm_mve_maxav">; class MVE_VMLAMLSDAV + bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0, + bits<2> vecsize> : MVE_rDest<(outs tGPREven:$RdaDest), iops, NoItinerary, iname, suffix, - "$RdaDest, $Qn, $Qm", cstr, []> { + "$RdaDest, $Qn, $Qm", cstr, vecsize, []> { bits<4> RdaDest; bits<3> Qm; bits<3> Qn; @@ -1050,11 +1052,11 @@ multiclass MVE_VMLAMLSDAV_A { def ""#x#VTI.Suffix : MVE_VMLAMLSDAV; + sz, bit_28, 0b0, X, bit_8, bit_0, VTI.Size>; def "a"#x#VTI.Suffix : MVE_VMLAMLSDAV; + sz, bit_28, 0b1, X, bit_8, bit_0, VTI.Size>; let Predicates = [HasMVEInt] in { def : Pat<(i32 (int_arm_mve_vmldava (i32 VTI.Unsigned), @@ -1255,9 +1257,9 @@ foreach acc = ["", "a"] in { // Base class for VMLALDAV and VMLSLDAV, VRMLALDAVH, VRMLSLDAVH class MVE_VMLALDAVBase pattern=[]> + bits<2> vecsize, list pattern=[]> : MVE_rDest<(outs tGPREven:$RdaLoDest, tGPROdd:$RdaHiDest), iops, NoItinerary, - iname, suffix, "$RdaLoDest, $RdaHiDest, $Qn, $Qm", cstr, pattern> { + iname, suffix, "$RdaLoDest, $RdaHiDest, $Qn, $Qm", cstr, vecsize, pattern> { bits<4> RdaLoDest; bits<4> RdaHiDest; bits<3> Qm; @@ -1285,35 +1287,35 @@ class MVE_VMLALDAVBase pattern=[]> { + bit sz, bit bit_28, bit X, bit bit_8, bit bit_0, + bits<2> vecsize, list pattern=[]> { def ""#x#suffix : MVE_VMLALDAVBase< iname # x, suffix, (ins MQPR:$Qn, MQPR:$Qm), "", - sz, bit_28, 0b0, X, bit_8, bit_0, pattern>; + sz, bit_28, 0b0, X, bit_8, bit_0, vecsize, pattern>; def "a"#x#suffix : MVE_VMLALDAVBase< iname # "a" # x, suffix, (ins tGPREven:$RdaLoSrc, tGPROdd:$RdaHiSrc, MQPR:$Qn, MQPR:$Qm), "$RdaLoDest = $RdaLoSrc,$RdaHiDest = $RdaHiSrc", - sz, bit_28, 0b1, X, bit_8, bit_0, pattern>; + sz, bit_28, 0b1, X, bit_8, bit_0, vecsize, pattern>; } multiclass MVE_VMLALDAVBase_AX pattern=[]> { + bit bit_8, bit bit_0, bits<2> vecsize, list pattern=[]> { defm "" : 
MVE_VMLALDAVBase_A; + bit_28, 0b0, bit_8, bit_0, vecsize, pattern>; defm "" : MVE_VMLALDAVBase_A; + bit_28, 0b1, bit_8, bit_0, vecsize, pattern>; } -multiclass MVE_VRMLALDAVH_multi pattern=[]> { - defm "" : MVE_VMLALDAVBase_AX<"vrmlaldavh", "s"#suffix, - 0b0, 0b0, 0b1, 0b0, pattern>; - defm "" : MVE_VMLALDAVBase_A<"vrmlaldavh", "", "u"#suffix, - 0b0, 0b1, 0b0, 0b1, 0b0, pattern>; +multiclass MVE_VRMLALDAVH_multi pattern=[]> { + defm "" : MVE_VMLALDAVBase_AX<"vrmlaldavh", "s"#VTI.BitsSuffix, + 0b0, 0b0, 0b1, 0b0, VTI.Size, pattern>; + defm "" : MVE_VMLALDAVBase_A<"vrmlaldavh", "", "u"#VTI.BitsSuffix, + 0b0, 0b1, 0b0, 0b1, 0b0, VTI.Size, pattern>; } -defm MVE_VRMLALDAVH : MVE_VRMLALDAVH_multi<"32">; +defm MVE_VRMLALDAVH : MVE_VRMLALDAVH_multi; // vrmlalvh aliases for vrmlaldavh def : MVEInstAlias<"vrmlalvh${vp}.s32\t$RdaLo, $RdaHi, $Qn, $Qm", @@ -1333,14 +1335,15 @@ def : MVEInstAlias<"vrmlalvha${vp}.u32\t$RdaLo, $RdaHi, $Qn, $Qm", tGPREven:$RdaLo, tGPROdd:$RdaHi, MQPR:$Qn, MQPR:$Qm, vpred_n:$vp)>; -multiclass MVE_VMLALDAV_multi pattern=[]> { - defm "" : MVE_VMLALDAVBase_AX<"vmlaldav", "s"#suffix, sz, 0b0, 0b0, 0b0, pattern>; - defm "" : MVE_VMLALDAVBase_A<"vmlaldav", "", "u"#suffix, - sz, 0b1, 0b0, 0b0, 0b0, pattern>; +multiclass MVE_VMLALDAV_multi pattern=[]> { + defm "" : MVE_VMLALDAVBase_AX<"vmlaldav", "s"#VTI.BitsSuffix, + VTI.Size{1}, 0b0, 0b0, 0b0, VTI.Size, pattern>; + defm "" : MVE_VMLALDAVBase_A<"vmlaldav", "", "u"#VTI.BitsSuffix, + VTI.Size{1}, 0b1, 0b0, 0b0, 0b0, VTI.Size, pattern>; } -defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"16", 0b0>; -defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"32", 0b1>; +defm MVE_VMLALDAV : MVE_VMLALDAV_multi; +defm MVE_VMLALDAV : MVE_VMLALDAV_multi; let Predicates = [HasMVEInt] in { def : Pat<(ARMVMLALVs (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)), @@ -1393,22 +1396,22 @@ foreach acc = ["", "a"] in { } multiclass MVE_VMLSLDAV_multi pattern=[]> { - defm "" : MVE_VMLALDAVBase_AX; + bit bit_28, bits<2> vecsize, list pattern=[]> { + defm "" : MVE_VMLALDAVBase_AX; } -defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0>; -defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0>; -defm MVE_VRMLSLDAVH : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1>; +defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s16", 0b0, 0b0, 0b01>; +defm MVE_VMLSLDAV : MVE_VMLSLDAV_multi<"vmlsldav", "s32", 0b1, 0b0, 0b10>; +defm MVE_VRMLSLDAVH : MVE_VMLSLDAV_multi<"vrmlsldavh", "s32", 0b0, 0b1, 0b10>; // end of mve_rDest instructions // start of mve_comp instructions class MVE_comp pattern=[]> + string cstr, bits<2> vecsize, list pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), itin, iname, suffix, - "$Qd, $Qn, $Qm", vpred_r, cstr, pattern> { + "$Qd, $Qn, $Qm", vpred_r, cstr, vecsize, pattern> { bits<4> Qd; bits<4> Qn; bits<4> Qm; @@ -1425,15 +1428,15 @@ class MVE_comp sz, bit bit_21, list pattern=[]> - : MVE_comp { + : MVE_comp { let Inst{28} = 0b1; let Inst{25-24} = 0b11; let Inst{23} = 0b0; let Inst{21} = bit_21; - let Inst{20} = sz; + let Inst{20} = sz{0}; let Inst{11} = 0b1; let Inst{8} = 0b1; let Inst{6} = 0b1; @@ -1444,7 +1447,7 @@ class MVE_VMINMAXNM { - def "" : MVE_VMINMAXNM; + def "" : MVE_VMINMAXNM; let Predicates = [HasMVEFloat] in { defm : MVE_TwoOpPattern(NAME)>; @@ -1459,7 +1462,7 @@ defm MVE_VMINNMf16 : MVE_VMINMAXNM_m<"vminnm", 0b1, MVE_v8f16, fminnum, int_arm_ class MVE_VMINMAX size, bit bit_4, list pattern=[]> - : MVE_comp { + : MVE_comp { let Inst{28} = U; let Inst{25-24} = 0b11; @@ -1505,8 +1508,8 @@ defm MVE_VMAXu32 : MVE_VMAX; // 
start of mve_bit instructions class MVE_bit_arith pattern=[]> - : MVE_p { + string ops, string cstr, bits<2> vecsize, list pattern=[]> + : MVE_p { bits<4> Qd; bits<4> Qm; @@ -1517,7 +1520,7 @@ class MVE_bit_arith { + "vbic", "", "$Qd, $Qn, $Qm", "", 0b00> { bits<4> Qn; let Inst{28} = 0b0; @@ -1533,9 +1536,10 @@ def MVE_VBIC : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), let validForTailPredication = 1; } -class MVE_VREV size, bits<2> bit_8_7, string cstr=""> +class MVE_VREV size, bits<2> bit_8_7, + bits<2> vecsize, string cstr=""> : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), iname, - suffix, "$Qd, $Qm", cstr> { + suffix, "$Qd, $Qm", cstr, vecsize> { let Inst{28} = 0b1; let Inst{25-23} = 0b111; @@ -1549,14 +1553,14 @@ class MVE_VREV size, bits<2> bit_8_7, strin let Inst{0} = 0b0; } -def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00, "@earlyclobber $Qd">; -def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00, "@earlyclobber $Qd">; -def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00, "@earlyclobber $Qd">; +def MVE_VREV64_8 : MVE_VREV<"vrev64", "8", 0b00, 0b00, 0b11, "@earlyclobber $Qd">; +def MVE_VREV64_16 : MVE_VREV<"vrev64", "16", 0b01, 0b00, 0b11, "@earlyclobber $Qd">; +def MVE_VREV64_32 : MVE_VREV<"vrev64", "32", 0b10, 0b00, 0b11, "@earlyclobber $Qd">; -def MVE_VREV32_8 : MVE_VREV<"vrev32", "8", 0b00, 0b01>; -def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01>; +def MVE_VREV32_8 : MVE_VREV<"vrev32", "8", 0b00, 0b01, 0b10>; +def MVE_VREV32_16 : MVE_VREV<"vrev32", "16", 0b01, 0b01, 0b10>; -def MVE_VREV16_8 : MVE_VREV<"vrev16", "8", 0b00, 0b10>; +def MVE_VREV16_8 : MVE_VREV<"vrev16", "8", 0b00, 0b10, 0b01>; let Predicates = [HasMVEInt] in { def : Pat<(v8i16 (bswap (v8i16 MQPR:$src))), @@ -1591,7 +1595,7 @@ let Predicates = [HasMVEInt] in { } def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), - "vmvn", "", "$Qd, $Qm", ""> { + "vmvn", "", "$Qd, $Qm", "", 0b00> { let Inst{28} = 0b1; let Inst{25-23} = 0b111; let Inst{21-16} = 0b110000; @@ -1614,7 +1618,7 @@ let Predicates = [HasMVEInt] in { class MVE_bit_ops bit_21_20, bit bit_28> : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), - iname, "", "$Qd, $Qn, $Qm", ""> { + iname, "", "$Qd, $Qn, $Qm", "", 0b00> { bits<4> Qn; let Inst{28} = bit_28; @@ -1685,9 +1689,9 @@ let Predicates = [HasMVEInt] in { int_arm_mve_orn_predicated, (? 
), MVE_VORN>; } -class MVE_bit_cmode +class MVE_bit_cmode vecsize> : MVE_p<(outs MQPR:$Qd), inOps, NoItinerary, - iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src"> { + iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src", vecsize> { bits<12> imm; bits<4> Qd; @@ -1710,7 +1714,7 @@ class MVE_bit_cmode multiclass MVE_bit_cmode_p { def "" : MVE_bit_cmode { + (ins MQPR:$Qd_src, imm_type:$imm), VTI.Size> { let Inst{5} = opcode; let validForTailPredication = 1; } @@ -1802,6 +1806,7 @@ class MVE_VMOV_lane_32 let Inst{16} = Idx{1}; let Inst{21} = Idx{0}; + let VecSize = 0b10; let Predicates = [HasFPRegsV8_1M]; } @@ -1813,6 +1818,8 @@ class MVE_VMOV_lane_16 let Inst{16} = Idx{2}; let Inst{21} = Idx{1}; let Inst{6} = Idx{0}; + + let VecSize = 0b01; } class MVE_VMOV_lane_8 @@ -1823,6 +1830,8 @@ class MVE_VMOV_lane_8 let Inst{21} = Idx{2}; let Inst{6} = Idx{1}; let Inst{5} = Idx{0}; + + let VecSize = 0b00; } def MVE_VMOV_from_lane_32 : MVE_VMOV_lane_32< MVE_VMOV_from_lane>; @@ -1933,7 +1942,7 @@ let Predicates = [HasMVEInt] in { class MVE_int size, list pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary, - iname, suffix, "$Qd, $Qn, $Qm", vpred_r, "", pattern> { + iname, suffix, "$Qd, $Qn, $Qm", vpred_r, "", size, pattern> { bits<4> Qd; bits<4> Qn; bits<4> Qm; @@ -2351,9 +2360,9 @@ defm MVE_VHSUBu8 : MVE_VHSUB; defm MVE_VHSUBu16 : MVE_VHSUB; defm MVE_VHSUBu32 : MVE_VHSUB; -class MVE_VDUP pattern=[]> +class MVE_VDUP vecsize, list pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary, - "vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> { + "vdup", suffix, "$Qd, $Rt", vpred_r, "", vecsize, pattern> { bits<4> Qd; bits<4> Rt; @@ -2372,9 +2381,9 @@ class MVE_VDUP pattern=[]> let validForTailPredication = 1; } -def MVE_VDUP32 : MVE_VDUP<"32", 0b0, 0b0>; -def MVE_VDUP16 : MVE_VDUP<"16", 0b0, 0b1>; -def MVE_VDUP8 : MVE_VDUP<"8", 0b1, 0b0>; +def MVE_VDUP32 : MVE_VDUP<"32", 0b0, 0b0, 0b10>; +def MVE_VDUP16 : MVE_VDUP<"16", 0b0, 0b1, 0b01>; +def MVE_VDUP8 : MVE_VDUP<"8", 0b1, 0b0, 0b00>; let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (ARMvdup (i32 rGPR:$elem))), @@ -2421,7 +2430,7 @@ let Predicates = [HasMVEInt] in { class MVEIntSingleSrc size, list pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qm), NoItinerary, - iname, suffix, "$Qd, $Qm", vpred_r, "", pattern> { + iname, suffix, "$Qd, $Qm", vpred_r, "", size, pattern> { bits<4> Qd; bits<4> Qm; @@ -2566,9 +2575,9 @@ defm MVE_VQABSNEG_Ps32 : vqabsneg_pattern; class MVE_mod_imm cmode, bit op, - dag iops, list pattern=[]> + dag iops, bits<2> vecsize, list pattern=[]> : MVE_p<(outs MQPR:$Qd), iops, NoItinerary, iname, suffix, "$Qd, $imm", - vpred_r, "", pattern> { + vpred_r, "", vecsize, pattern> { bits<13> imm; bits<4> Qd; @@ -2591,21 +2600,21 @@ class MVE_mod_imm cmode, bit op, let isReMaterializable = 1 in { let isAsCheapAsAMove = 1 in { -def MVE_VMOVimmi8 : MVE_mod_imm<"vmov", "i8", {1,1,1,0}, 0b0, (ins nImmSplatI8:$imm)>; -def MVE_VMOVimmi16 : MVE_mod_imm<"vmov", "i16", {1,0,?,0}, 0b0, (ins nImmSplatI16:$imm)> { +def MVE_VMOVimmi8 : MVE_mod_imm<"vmov", "i8", {1,1,1,0}, 0b0, (ins nImmSplatI8:$imm), 0b00>; +def MVE_VMOVimmi16 : MVE_mod_imm<"vmov", "i16", {1,0,?,0}, 0b0, (ins nImmSplatI16:$imm), 0b01> { let Inst{9} = imm{9}; } -def MVE_VMOVimmi32 : MVE_mod_imm<"vmov", "i32", {?,?,?,?}, 0b0, (ins nImmVMOVI32:$imm)> { +def MVE_VMOVimmi32 : MVE_mod_imm<"vmov", "i32", {?,?,?,?}, 0b0, (ins nImmVMOVI32:$imm), 0b10> { let Inst{11-8} = imm{11-8}; } -def MVE_VMOVimmi64 : MVE_mod_imm<"vmov", "i64", {1,1,1,0}, 0b1, (ins 
nImmSplatI64:$imm)>; -def MVE_VMOVimmf32 : MVE_mod_imm<"vmov", "f32", {1,1,1,1}, 0b0, (ins nImmVMOVF32:$imm)>; +def MVE_VMOVimmi64 : MVE_mod_imm<"vmov", "i64", {1,1,1,0}, 0b1, (ins nImmSplatI64:$imm), 0b11>; +def MVE_VMOVimmf32 : MVE_mod_imm<"vmov", "f32", {1,1,1,1}, 0b0, (ins nImmVMOVF32:$imm), 0b10>; } // let isAsCheapAsAMove = 1 -def MVE_VMVNimmi16 : MVE_mod_imm<"vmvn", "i16", {1,0,?,0}, 0b1, (ins nImmSplatI16:$imm)> { +def MVE_VMVNimmi16 : MVE_mod_imm<"vmvn", "i16", {1,0,?,0}, 0b1, (ins nImmSplatI16:$imm), 0b01> { let Inst{9} = imm{9}; } -def MVE_VMVNimmi32 : MVE_mod_imm<"vmvn", "i32", {?,?,?,?}, 0b1, (ins nImmVMOVI32:$imm)> { +def MVE_VMVNimmi32 : MVE_mod_imm<"vmvn", "i32", {?,?,?,?}, 0b1, (ins nImmVMOVI32:$imm), 0b10> { let Inst{11-8} = imm{11-8}; } } // let isReMaterializable = 1 @@ -2642,7 +2651,7 @@ class MVE_VMINMAXA size, bit bit_12, list pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm), NoItinerary, iname, suffix, "$Qd, $Qm", vpred_n, "$Qd = $Qd_src", - pattern> { + size, pattern> { bits<4> Qd; bits<4> Qm; @@ -2701,7 +2710,7 @@ defm MVE_VMAXAs32 : MVE_VMAXA; def MVE_VSHLC : MVE_p<(outs rGPR:$RdmDest, MQPR:$Qd), (ins MQPR:$QdSrc, rGPR:$RdmSrc, long_shift:$imm), NoItinerary, "vshlc", "", "$QdSrc, $RdmSrc, $imm", - vpred_n, "$RdmDest = $RdmSrc,$Qd = $QdSrc"> { + vpred_n, "$RdmDest = $RdmSrc,$Qd = $QdSrc", 0b10> { bits<5> imm; bits<4> Qd; bits<4> RdmDest; @@ -2718,8 +2727,8 @@ def MVE_VSHLC : MVE_p<(outs rGPR:$RdmDest, MQPR:$Qd), class MVE_shift_imm pattern=[]> - : MVE_p { + bits<2> vecsize, list pattern=[]> + : MVE_p { bits<4> Qd; bits<4> Qm; @@ -2733,7 +2742,7 @@ class MVE_VMOVL sz, bit U, bit top, list pattern=[]> : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm), iname, suffix, "$Qd, $Qm", vpred_r, "", - pattern> { + sz, pattern> { let Inst{28} = U; let Inst{25-23} = 0b101; let Inst{21} = 0b1; @@ -2799,9 +2808,9 @@ let Predicates = [HasMVEInt] in { class MVE_VSHLL_imm pattern=[]> + Operand immtype, bits<2> vecsize, list pattern=[]> : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm, immtype:$imm), - iname, suffix, "$Qd, $Qm, $imm", vpred_r, "", pattern> { + iname, suffix, "$Qd, $Qm, $imm", vpred_r, "", vecsize, pattern> { let Inst{28} = U; let Inst{25-23} = 0b101; let Inst{21} = 0b1; @@ -2822,7 +2831,7 @@ class MVE_VSHLL_imm pattern=[]> - : MVE_VSHLL_imm { + : MVE_VSHLL_imm { bits<3> imm; let Inst{20-19} = 0b01; let Inst{18-16} = imm; @@ -2830,7 +2839,7 @@ class MVE_VSHLL_imm8 pattern=[]> - : MVE_VSHLL_imm { + : MVE_VSHLL_imm { bits<4> imm; let Inst{20} = 0b1; let Inst{19-16} = imm; @@ -2848,7 +2857,7 @@ def MVE_VSHLL_immu16th : MVE_VSHLL_imm16<"vshllt", "u16", 0b1, 0b1>; class MVE_VSHLL_by_lane_width size, bit U, string ops, list pattern=[]> : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm), - iname, suffix, ops, vpred_r, "", pattern> { + iname, suffix, ops, vpred_r, "", !if(size, 0b10, 0b01), pattern> { let Inst{28} = U; let Inst{25-23} = 0b100; let Inst{21-20} = 0b11; @@ -2910,15 +2919,15 @@ foreach VTI = [MVE_v16s8, MVE_v8s16, MVE_v16u8, MVE_v8u16] in foreach top = [0, 1] in defm : MVE_VSHLL_patterns; -class MVE_shift_imm_partial +class MVE_shift_imm_partial vecsize> : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$QdSrc, MQPR:$Qm, imm:$imm), - iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc"> { + iname, suffix, "$Qd, $Qm, $imm", vpred_n, "$Qd = $QdSrc", vecsize> { Operand immediateType = imm; } class MVE_VxSHRN - : MVE_shift_imm_partial { + Operand imm, bits<2> vecsize> + : MVE_shift_imm_partial { bits<5> imm; let Inst{28} = bit_28; @@ -2933,35 +2942,35 @@ 
class MVE_VxSHRN { +def MVE_VRSHRNi16bh : MVE_VxSHRN<"vrshrnb", "i16", 0b0, 0b1, shr_imm8, 0b01> { let Inst{20-19} = 0b01; } -def MVE_VRSHRNi16th : MVE_VxSHRN<"vrshrnt", "i16", 0b1, 0b1, shr_imm8> { +def MVE_VRSHRNi16th : MVE_VxSHRN<"vrshrnt", "i16", 0b1, 0b1, shr_imm8, 0b01> { let Inst{20-19} = 0b01; } -def MVE_VRSHRNi32bh : MVE_VxSHRN<"vrshrnb", "i32", 0b0, 0b1, shr_imm16> { +def MVE_VRSHRNi32bh : MVE_VxSHRN<"vrshrnb", "i32", 0b0, 0b1, shr_imm16, 0b10> { let Inst{20} = 0b1; } -def MVE_VRSHRNi32th : MVE_VxSHRN<"vrshrnt", "i32", 0b1, 0b1, shr_imm16> { +def MVE_VRSHRNi32th : MVE_VxSHRN<"vrshrnt", "i32", 0b1, 0b1, shr_imm16, 0b10> { let Inst{20} = 0b1; } -def MVE_VSHRNi16bh : MVE_VxSHRN<"vshrnb", "i16", 0b0, 0b0, shr_imm8> { +def MVE_VSHRNi16bh : MVE_VxSHRN<"vshrnb", "i16", 0b0, 0b0, shr_imm8, 0b01> { let Inst{20-19} = 0b01; } -def MVE_VSHRNi16th : MVE_VxSHRN<"vshrnt", "i16", 0b1, 0b0, shr_imm8> { +def MVE_VSHRNi16th : MVE_VxSHRN<"vshrnt", "i16", 0b1, 0b0, shr_imm8, 0b01> { let Inst{20-19} = 0b01; } -def MVE_VSHRNi32bh : MVE_VxSHRN<"vshrnb", "i32", 0b0, 0b0, shr_imm16> { +def MVE_VSHRNi32bh : MVE_VxSHRN<"vshrnb", "i32", 0b0, 0b0, shr_imm16, 0b10> { let Inst{20} = 0b1; } -def MVE_VSHRNi32th : MVE_VxSHRN<"vshrnt", "i32", 0b1, 0b0, shr_imm16> { +def MVE_VSHRNi32th : MVE_VxSHRN<"vshrnt", "i32", 0b1, 0b0, shr_imm16, 0b10> { let Inst{20} = 0b1; } class MVE_VxQRSHRUN - : MVE_shift_imm_partial { + Operand imm, bits<2> vecsize> + : MVE_shift_imm_partial { bits<5> imm; let Inst{28} = bit_28; @@ -2977,42 +2986,42 @@ class MVE_VxQRSHRUN { + "vqrshrunb", "s16", 0b1, 0b0, shr_imm8, 0b01> { let Inst{20-19} = 0b01; } def MVE_VQRSHRUNs16th : MVE_VxQRSHRUN< - "vqrshrunt", "s16", 0b1, 0b1, shr_imm8> { + "vqrshrunt", "s16", 0b1, 0b1, shr_imm8, 0b01> { let Inst{20-19} = 0b01; } def MVE_VQRSHRUNs32bh : MVE_VxQRSHRUN< - "vqrshrunb", "s32", 0b1, 0b0, shr_imm16> { + "vqrshrunb", "s32", 0b1, 0b0, shr_imm16, 0b10> { let Inst{20} = 0b1; } def MVE_VQRSHRUNs32th : MVE_VxQRSHRUN< - "vqrshrunt", "s32", 0b1, 0b1, shr_imm16> { + "vqrshrunt", "s32", 0b1, 0b1, shr_imm16, 0b10> { let Inst{20} = 0b1; } def MVE_VQSHRUNs16bh : MVE_VxQRSHRUN< - "vqshrunb", "s16", 0b0, 0b0, shr_imm8> { + "vqshrunb", "s16", 0b0, 0b0, shr_imm8, 0b01> { let Inst{20-19} = 0b01; } def MVE_VQSHRUNs16th : MVE_VxQRSHRUN< - "vqshrunt", "s16", 0b0, 0b1, shr_imm8> { + "vqshrunt", "s16", 0b0, 0b1, shr_imm8, 0b01> { let Inst{20-19} = 0b01; } def MVE_VQSHRUNs32bh : MVE_VxQRSHRUN< - "vqshrunb", "s32", 0b0, 0b0, shr_imm16> { + "vqshrunb", "s32", 0b0, 0b0, shr_imm16, 0b10> { let Inst{20} = 0b1; } def MVE_VQSHRUNs32th : MVE_VxQRSHRUN< - "vqshrunt", "s32", 0b0, 0b1, shr_imm16> { + "vqshrunt", "s32", 0b0, 0b1, shr_imm16, 0b10> { let Inst{20} = 0b1; } class MVE_VxQRSHRN - : MVE_shift_imm_partial { + Operand imm, bits<2> vecsize> + : MVE_shift_imm_partial { bits<5> imm; let Inst{25-23} = 0b101; @@ -3027,19 +3036,19 @@ class MVE_VxQRSHRN { - def s16 : MVE_VxQRSHRN { + def s16 : MVE_VxQRSHRN { let Inst{28} = 0b0; let Inst{20-19} = 0b01; } - def u16 : MVE_VxQRSHRN { + def u16 : MVE_VxQRSHRN { let Inst{28} = 0b1; let Inst{20-19} = 0b01; } - def s32 : MVE_VxQRSHRN { + def s32 : MVE_VxQRSHRN { let Inst{28} = 0b0; let Inst{20} = 0b1; } - def u32 : MVE_VxQRSHRN { + def u32 : MVE_VxQRSHRN { let Inst{28} = 0b1; let Inst{20} = 0b1; } @@ -3114,7 +3123,7 @@ defm : MVE_VSHRN_patterns; class MVE_shift_by_vec size, bit bit_4, bit bit_8> : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qm, MQPR:$Qn), NoItinerary, - iname, suffix, "$Qd, $Qm, $Qn", vpred_r, "", []> { + iname, suffix, "$Qd, $Qm, $Qn", 
vpred_r, "", size, []> { // Shift instructions which take a vector of shift counts bits<4> Qd; bits<4> Qm; @@ -3189,8 +3198,8 @@ let Predicates = [HasMVEInt] in { class MVE_shift_with_imm pattern=[]> - : MVE_p { + bits<2> vecsize, list pattern=[]> + : MVE_p { bits<4> Qd; bits<4> Qm; @@ -3213,10 +3222,10 @@ class MVE_shift_with_imm +class MVE_VSxI_imm vecsize> : MVE_shift_with_imm { + "$Qd, $Qm, $imm", vpred_n, "$Qd = $Qd_src", vecsize> { bits<6> imm; let Inst{28} = 0b1; let Inst{25-24} = 0b11; @@ -3228,27 +3237,27 @@ class MVE_VSxI_imm Operand immediateType = immType; } -def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, shr_imm8> { +def MVE_VSRIimm8 : MVE_VSxI_imm<"vsri", "8", 0b0, shr_imm8, 0b00> { let Inst{21-19} = 0b001; } -def MVE_VSRIimm16 : MVE_VSxI_imm<"vsri", "16", 0b0, shr_imm16> { +def MVE_VSRIimm16 : MVE_VSxI_imm<"vsri", "16", 0b0, shr_imm16, 0b01> { let Inst{21-20} = 0b01; } -def MVE_VSRIimm32 : MVE_VSxI_imm<"vsri", "32", 0b0, shr_imm32> { +def MVE_VSRIimm32 : MVE_VSxI_imm<"vsri", "32", 0b0, shr_imm32, 0b10> { let Inst{21} = 0b1; } -def MVE_VSLIimm8 : MVE_VSxI_imm<"vsli", "8", 0b1, imm0_7> { +def MVE_VSLIimm8 : MVE_VSxI_imm<"vsli", "8", 0b1, imm0_7, 0b00> { let Inst{21-19} = 0b001; } -def MVE_VSLIimm16 : MVE_VSxI_imm<"vsli", "16", 0b1, imm0_15> { +def MVE_VSLIimm16 : MVE_VSxI_imm<"vsli", "16", 0b1, imm0_15, 0b01> { let Inst{21-20} = 0b01; } -def MVE_VSLIimm32 : MVE_VSxI_imm<"vsli", "32", 0b1,imm0_31> { +def MVE_VSLIimm32 : MVE_VSxI_imm<"vsli", "32", 0b1,imm0_31, 0b10> { let Inst{21} = 0b1; } @@ -3277,7 +3286,7 @@ defm : MVE_VSxI_patterns; class MVE_VQSHL_imm : MVE_shift_with_imm<"vqshl", VTI_.Suffix, (outs MQPR:$Qd), (ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm", - vpred_r, ""> { + vpred_r, "", VTI_.Size> { bits<6> imm; let Inst{28} = VTI_.Unsigned; @@ -3317,7 +3326,7 @@ let unpred_int = int_arm_mve_vqshl_imm, class MVE_VQSHLU_imm : MVE_shift_with_imm<"vqshlu", VTI_.Suffix, (outs MQPR:$Qd), (ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm", - vpred_r, ""> { + vpred_r, "", VTI_.Size> { bits<6> imm; let Inst{28} = 0b1; @@ -3347,7 +3356,7 @@ let unpred_int = int_arm_mve_vqshlu_imm, class MVE_VRSHR_imm : MVE_shift_with_imm<"vrshr", VTI_.Suffix, (outs MQPR:$Qd), (ins MQPR:$Qm, immType:$imm), "$Qd, $Qm, $imm", - vpred_r, ""> { + vpred_r, "", VTI_.Size> { bits<6> imm; let Inst{28} = VTI_.Unsigned; @@ -3421,10 +3430,10 @@ defm : MVE_shift_imm_patterns; defm : MVE_shift_imm_patterns; defm : MVE_shift_imm_patterns; -class MVE_VSHR_imm +class MVE_VSHR_imm vecsize> : MVE_shift_with_imm<"vshr", suffix, (outs MQPR:$Qd), !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm", - vpred_r, ""> { + vpred_r, "", vecsize> { bits<6> imm; let Inst{25-24} = 0b11; @@ -3432,40 +3441,40 @@ class MVE_VSHR_imm let Inst{10-8} = 0b000; } -def MVE_VSHR_imms8 : MVE_VSHR_imm<"s8", (ins shr_imm8:$imm)> { +def MVE_VSHR_imms8 : MVE_VSHR_imm<"s8", (ins shr_imm8:$imm), 0b00> { let Inst{28} = 0b0; let Inst{21-19} = 0b001; } -def MVE_VSHR_immu8 : MVE_VSHR_imm<"u8", (ins shr_imm8:$imm)> { +def MVE_VSHR_immu8 : MVE_VSHR_imm<"u8", (ins shr_imm8:$imm), 0b00> { let Inst{28} = 0b1; let Inst{21-19} = 0b001; } -def MVE_VSHR_imms16 : MVE_VSHR_imm<"s16", (ins shr_imm16:$imm)> { +def MVE_VSHR_imms16 : MVE_VSHR_imm<"s16", (ins shr_imm16:$imm), 0b01> { let Inst{28} = 0b0; let Inst{21-20} = 0b01; } -def MVE_VSHR_immu16 : MVE_VSHR_imm<"u16", (ins shr_imm16:$imm)> { +def MVE_VSHR_immu16 : MVE_VSHR_imm<"u16", (ins shr_imm16:$imm), 0b01> { let Inst{28} = 0b1; let Inst{21-20} = 0b01; } -def MVE_VSHR_imms32 : MVE_VSHR_imm<"s32", (ins shr_imm32:$imm)> 
{ +def MVE_VSHR_imms32 : MVE_VSHR_imm<"s32", (ins shr_imm32:$imm), 0b10> { let Inst{28} = 0b0; let Inst{21} = 0b1; } -def MVE_VSHR_immu32 : MVE_VSHR_imm<"u32", (ins shr_imm32:$imm)> { +def MVE_VSHR_immu32 : MVE_VSHR_imm<"u32", (ins shr_imm32:$imm), 0b10> { let Inst{28} = 0b1; let Inst{21} = 0b1; } -class MVE_VSHL_imm +class MVE_VSHL_imm vecsize> : MVE_shift_with_imm<"vshl", suffix, (outs MQPR:$Qd), !con((ins MQPR:$Qm), imm), "$Qd, $Qm, $imm", - vpred_r, ""> { + vpred_r, "", vecsize> { bits<6> imm; let Inst{28} = 0b0; @@ -3474,15 +3483,15 @@ class MVE_VSHL_imm let Inst{10-8} = 0b101; } -def MVE_VSHL_immi8 : MVE_VSHL_imm<"i8", (ins imm0_7:$imm)> { +def MVE_VSHL_immi8 : MVE_VSHL_imm<"i8", (ins imm0_7:$imm), 0b00> { let Inst{21-19} = 0b001; } -def MVE_VSHL_immi16 : MVE_VSHL_imm<"i16", (ins imm0_15:$imm)> { +def MVE_VSHL_immi16 : MVE_VSHL_imm<"i16", (ins imm0_15:$imm), 0b01> { let Inst{21-20} = 0b01; } -def MVE_VSHL_immi32 : MVE_VSHL_imm<"i32", (ins imm0_31:$imm)> { +def MVE_VSHL_immi32 : MVE_VSHL_imm<"i32", (ins imm0_31:$imm), 0b10> { let Inst{21} = 0b1; } @@ -3526,8 +3535,8 @@ let Predicates = [HasMVEInt] in { // start of MVE Floating Point instructions class MVE_float pattern=[]> - : MVE_f { + vpred_ops vpred, string cstr, bits<2> vecsize, list pattern=[]> + : MVE_f { bits<4> Qm; let Inst{12} = 0b0; @@ -3540,7 +3549,7 @@ class MVE_float op, string suffix, bits<2> size, list pattern=[]> : MVE_float { + (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", size, pattern> { bits<4> Qd; let Inst{28} = 0b1; @@ -3587,16 +3596,16 @@ defm MVE_VRINTf32 : MVE_VRINT_ops; class MVEFloatArithNeon pattern=[]> - : MVE_float { + vpred_ops vpred, string cstr, bits<2> vecsize, list pattern=[]> + : MVE_float { let Inst{20} = size; let Inst{16} = 0b0; } -class MVE_VMUL_fp pattern=[]> - : MVEFloatArithNeon size, list pattern=[]> + : MVEFloatArithNeon { + size, pattern> { bits<4> Qd; bits<4> Qn; @@ -3614,7 +3623,7 @@ class MVE_VMUL_fp pattern=[]> multiclass MVE_VMULT_fp_m { - def "" : MVE_VMUL_fp; + def "" : MVE_VMUL_fp; defvar Inst = !cast(NAME); let Predicates = [HasMVEFloat] in { @@ -3628,10 +3637,10 @@ multiclass MVE_VMUL_fp_m defm MVE_VMULf32 : MVE_VMUL_fp_m; defm MVE_VMULf16 : MVE_VMUL_fp_m; -class MVE_VCMLA - : MVEFloatArithNeon<"vcmla", suffix, size, (outs MQPR:$Qd), +class MVE_VCMLA size> + : MVEFloatArithNeon<"vcmla", suffix, size{1}, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot), - "$Qd, $Qn, $Qm, $rot", vpred_n, "$Qd = $Qd_src", []> { + "$Qd, $Qn, $Qm, $rot", vpred_n, "$Qd = $Qd_src", size, []> { bits<4> Qd; bits<4> Qn; bits<2> rot; @@ -3648,8 +3657,8 @@ class MVE_VCMLA let Inst{4} = 0b0; } -multiclass MVE_VCMLA_m { - def "" : MVE_VCMLA; +multiclass MVE_VCMLA_m { + def "" : MVE_VCMLA; defvar Inst = !cast(NAME); let Predicates = [HasMVEFloat] in { @@ -3671,16 +3680,16 @@ multiclass MVE_VCMLA_m { } } -defm MVE_VCMLAf16 : MVE_VCMLA_m; -defm MVE_VCMLAf32 : MVE_VCMLA_m; +defm MVE_VCMLAf16 : MVE_VCMLA_m; +defm MVE_VCMLAf32 : MVE_VCMLA_m; -class MVE_VADDSUBFMA_fp size, bit bit_4, bit bit_8, bit bit_21, dag iops=(ins), vpred_ops vpred=vpred_r, string cstr="", list pattern=[]> - : MVEFloatArithNeon { + vpred, cstr, size, pattern> { bits<4> Qd; bits<4> Qn; @@ -3698,7 +3707,7 @@ class MVE_VADDSUBFMA_fp { - def "" : MVE_VADDSUBFMA_fp; defvar Inst = !cast(NAME); defvar pred_int = int_arm_mve_fma_predicated; @@ -3739,7 +3748,7 @@ defm MVE_VFMSf16 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v8f16>; multiclass MVE_VADDSUB_fp_m { - def "" : MVE_VADDSUBFMA_fp { + def "" : MVE_VADDSUBFMA_fp { let 
validForTailPredication = 1; } defvar Inst = !cast(NAME); @@ -3760,10 +3769,10 @@ defm MVE_VADDf16 : MVE_VADD_fp_m; defm MVE_VSUBf32 : MVE_VSUB_fp_m; defm MVE_VSUBf16 : MVE_VSUB_fp_m; -class MVE_VCADD - : MVEFloatArithNeon<"vcadd", suffix, size, (outs MQPR:$Qd), +class MVE_VCADD size, string cstr=""> + : MVEFloatArithNeon<"vcadd", suffix, size{1}, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm, complexrotateopodd:$rot), - "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, []> { + "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, size, []> { bits<4> Qd; bits<4> Qn; bit rot; @@ -3781,8 +3790,8 @@ class MVE_VCADD let Inst{4} = 0b0; } -multiclass MVE_VCADD_m { - def "" : MVE_VCADD; +multiclass MVE_VCADD_m { + def "" : MVE_VCADD; defvar Inst = !cast(NAME); let Predicates = [HasMVEFloat] in { @@ -3802,12 +3811,12 @@ multiclass MVE_VCADD_m { } } -defm MVE_VCADDf16 : MVE_VCADD_m; -defm MVE_VCADDf32 : MVE_VCADD_m; +defm MVE_VCADDf16 : MVE_VCADD_m; +defm MVE_VCADDf32 : MVE_VCADD_m; -class MVE_VABD_fp +class MVE_VABD_fp size> : MVE_float<"vabd", suffix, (outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), - "$Qd, $Qn, $Qm", vpred_r, ""> { + "$Qd, $Qn, $Qm", vpred_r, "", size> { bits<4> Qd; bits<4> Qn; @@ -3815,7 +3824,7 @@ class MVE_VABD_fp let Inst{25-23} = 0b110; let Inst{22} = Qd{3}; let Inst{21} = 0b1; - let Inst{20} = size; + let Inst{20} = size{0}; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b0; let Inst{15-13} = Qd{2-0}; @@ -3827,7 +3836,7 @@ class MVE_VABD_fp multiclass MVE_VABDT_fp_m { - def "" : MVE_VABD_fp; + def "" : MVE_VABD_fp; defvar Inst = !cast(NAME); let Predicates = [HasMVEFloat] in { @@ -3847,7 +3856,7 @@ multiclass MVE_VABD_fp_m : MVE_VABDT_fp_m; defm MVE_VABDf32 : MVE_VABD_fp_m; -defm MVE_VABDf16 : MVE_VABD_fp_m; +defm MVE_VABDf16 : MVE_VABD_fp_m; let Predicates = [HasMVEFloat] in { def : Pat<(v8f16 (fabs (fsub (v8f16 MQPR:$Qm), (v8f16 MQPR:$Qn)))), @@ -3860,7 +3869,7 @@ class MVE_VCVT_fix : MVE_float<"vcvt", suffix, (outs MQPR:$Qd), (ins MQPR:$Qm, imm_operand_type:$imm6), - "$Qd, $Qm, $imm6", vpred_r, "", []> { + "$Qd, $Qm, $imm6", vpred_r, "", !if(fsi, 0b10, 0b01), []> { bits<4> Qd; bits<6> imm6; @@ -3943,7 +3952,7 @@ defm MVE_VCVTu32f32_fix : MVE_VCVT_fix_f32_m<0b1, 0b1, MVE_v4u32, MVE_v4f32>; class MVE_VCVT_fp_int_anpm size, bit op, string anpm, bits<2> rm, list pattern=[]> : MVE_float { + (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", size, pattern> { bits<4> Qd; let Inst{28} = 0b1; @@ -4000,7 +4009,7 @@ defm MVE_VCVTu32f32 : MVE_VCVT_fp_int_anpm_outer; class MVE_VCVT_fp_int size, bit toint, bit unsigned, list pattern=[]> : MVE_float<"vcvt", suffix, (outs MQPR:$Qd), - (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> { + (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", size, pattern> { bits<4> Qd; let Inst{28} = 0b1; @@ -4063,7 +4072,7 @@ let Predicates = [HasMVEFloat] in { class MVE_VABSNEG_fp size, bit negate, list pattern=[]> : MVE_float { + (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", size, pattern> { bits<4> Qd; let Inst{28} = 0b1; @@ -4102,15 +4111,15 @@ defm MVE_VNEGf16 : MVE_VABSNEG_fp_m<"vneg", fneg, int_arm_mve_neg_predicated, defm MVE_VNEGf32 : MVE_VABSNEG_fp_m<"vneg", fneg, int_arm_mve_neg_predicated, MVE_v4f32, 1>; -class MVE_VMAXMINNMA size, bit bit_12, list pattern=[]> : MVE_f<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm), NoItinerary, iname, suffix, "$Qd, $Qm", vpred_n, "$Qd = $Qd_src", - pattern> { + size, pattern> { bits<4> Qd; bits<4> Qm; - let Inst{28} = size; + let Inst{28} = size{0}; let Inst{25-23} = 0b100; let Inst{22} = Qd{3}; let Inst{21-16} = 0b111111; @@ -4129,7 +4138,7 @@ class MVE_VMAXMINNMA { - def "" 
: MVE_VMAXMINNMA; + def "" : MVE_VMAXMINNMA; defvar Inst = !cast(NAME); let Predicates = [HasMVEInt] in { @@ -4163,9 +4172,9 @@ defm MVE_VMINNMAf16 : MVE_VMINNMA; // start of MVE compares class MVE_VCMPqq bits_21_20, - VCMPPredicateOperand predtype, list pattern=[]> + VCMPPredicateOperand predtype, bits<2> vecsize, list pattern=[]> : MVE_p<(outs VCCR:$P0), (ins MQPR:$Qn, MQPR:$Qm, predtype:$fc), - NoItinerary, "vcmp", suffix, "$fc, $Qn, $Qm", vpred_n, "", pattern> { + NoItinerary, "vcmp", suffix, "$fc, $Qn, $Qm", vpred_n, "", vecsize, pattern> { // Base class for comparing two vector registers bits<3> fc; bits<4> Qn; @@ -4200,24 +4209,24 @@ class MVE_VCMPqq bits_21_20, } class MVE_VCMPqqf - : MVE_VCMPqq { + : MVE_VCMPqq { let Predicates = [HasMVEFloat]; } class MVE_VCMPqqi size> - : MVE_VCMPqq { + : MVE_VCMPqq { let Inst{12} = 0b0; let Inst{0} = 0b0; } class MVE_VCMPqqu size> - : MVE_VCMPqq { + : MVE_VCMPqq { let Inst{12} = 0b0; let Inst{0} = 0b1; } class MVE_VCMPqqs size> - : MVE_VCMPqq { + : MVE_VCMPqq { let Inst{12} = 0b1; } @@ -4237,9 +4246,9 @@ def MVE_VCMPs16 : MVE_VCMPqqs<"s16", 0b01>; def MVE_VCMPs32 : MVE_VCMPqqs<"s32", 0b10>; class MVE_VCMPqr bits_21_20, - VCMPPredicateOperand predtype, list pattern=[]> + VCMPPredicateOperand predtype, bits<2> vecsize, list pattern=[]> : MVE_p<(outs VCCR:$P0), (ins MQPR:$Qn, GPRwithZR:$Rm, predtype:$fc), - NoItinerary, "vcmp", suffix, "$fc, $Qn, $Rm", vpred_n, "", pattern> { + NoItinerary, "vcmp", suffix, "$fc, $Qn, $Rm", vpred_n, "", vecsize, pattern> { // Base class for comparing a vector register with a scalar bits<3> fc; bits<4> Qn; @@ -4265,24 +4274,24 @@ class MVE_VCMPqr bits_21_20, } class MVE_VCMPqrf - : MVE_VCMPqr { + : MVE_VCMPqr { let Predicates = [HasMVEFloat]; } class MVE_VCMPqri size> - : MVE_VCMPqr { + : MVE_VCMPqr { let Inst{12} = 0b0; let Inst{5} = 0b0; } class MVE_VCMPqru size> - : MVE_VCMPqr { + : MVE_VCMPqr { let Inst{12} = 0b0; let Inst{5} = 0b1; } class MVE_VCMPqrs size> - : MVE_VCMPqr { + : MVE_VCMPqr { let Inst{12} = 0b1; } @@ -4490,9 +4499,9 @@ let Predicates = [HasMVEInt] in { class MVE_qDest_qSrc pattern=[]> + bits<2> vecsize, list pattern=[]> : MVE_p { + ops, vpred, cstr, vecsize, pattern> { bits<4> Qd; bits<4> Qm; @@ -4507,10 +4516,11 @@ class MVE_qDest_qSrc size, string cstr="", list pattern=[]> + string suffix, bits<2> size, string cstr="", + list pattern=[]> : MVE_qDest_qSrc { + vpred_n, "$Qd = $Qd_src"#cstr, size, pattern> { bits<4> Qn; let Inst{28} = subtract; @@ -4560,14 +4570,15 @@ defm MVE_VQDMLSDHX : MVE_VQxDMLxDH_multi<"vqdmlsdhx", 0b1, 0b0, 0b1>; defm MVE_VQRDMLSDH : MVE_VQxDMLxDH_multi<"vqrdmlsdh", 0b0, 0b1, 0b1>; defm MVE_VQRDMLSDHX : MVE_VQxDMLxDH_multi<"vqrdmlsdhx", 0b1, 0b1, 0b1>; -class MVE_VCMUL +class MVE_VCMUL size, string cstr=""> : MVE_qDest_qSrc { + "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, size, + []> { bits<4> Qn; bits<2> rot; - let Inst{28} = size; + let Inst{28} = size{1}; let Inst{21-20} = 0b11; let Inst{19-17} = Qn{2-0}; let Inst{16} = 0b0; @@ -4580,8 +4591,8 @@ class MVE_VCMUL } multiclass MVE_VCMUL_m { - def "" : MVE_VCMUL; + string cstr=""> { + def "" : MVE_VCMUL; defvar Inst = !cast(NAME); let Predicates = [HasMVEFloat] in { @@ -4601,14 +4612,14 @@ multiclass MVE_VCMUL_m; -defm MVE_VCMULf32 : MVE_VCMUL_m<"vcmul", MVE_v4f32, 0b1, "@earlyclobber $Qd">; +defm MVE_VCMULf16 : MVE_VCMUL_m<"vcmul", MVE_v8f16>; +defm MVE_VCMULf32 : MVE_VCMUL_m<"vcmul", MVE_v4f32, "@earlyclobber $Qd">; class MVE_VMULL bits_21_20, - bit T, string cstr, list pattern=[]> + bit T, string cstr, bits<2> vecsize, list 
pattern=[]> : MVE_qDest_qSrc { + vpred_r, cstr, vecsize, pattern> { bits<4> Qd; bits<4> Qn; bits<4> Qm; @@ -4627,9 +4638,9 @@ class MVE_VMULL bits_21_20, multiclass MVE_VMULL_m { + bit Top, bits<2> vecsize, string cstr=""> { def "" : MVE_VMULL<"vmull" # !if(Top, "t", "b"), VTI.Suffix, VTI.Unsigned, - VTI.Size, Top, cstr>; + VTI.Size, Top, cstr, vecsize>; defvar Inst = !cast(NAME); let Predicates = [HasMVEInt] in { @@ -4656,43 +4667,43 @@ multiclass MVE_VMULL_m; + int_arm_mve_mull_int_predicated, 0b0, 0b01>; defm MVE_VMULLTs8 : MVE_VMULL_m; + int_arm_mve_mull_int_predicated, 0b1, 0b01>; defm MVE_VMULLBs16 : MVE_VMULL_m; + int_arm_mve_mull_int_predicated, 0b0, 0b10>; defm MVE_VMULLTs16 : MVE_VMULL_m; + int_arm_mve_mull_int_predicated, 0b1, 0b10>; defm MVE_VMULLBs32 : MVE_VMULL_m; defm MVE_VMULLTs32 : MVE_VMULL_m; defm MVE_VMULLBu8 : MVE_VMULL_m; + int_arm_mve_mull_int_predicated, 0b0, 0b01>; defm MVE_VMULLTu8 : MVE_VMULL_m; + int_arm_mve_mull_int_predicated, 0b1, 0b01>; defm MVE_VMULLBu16 : MVE_VMULL_m; + int_arm_mve_mull_int_predicated, 0b0, 0b10>; defm MVE_VMULLTu16 : MVE_VMULL_m; + int_arm_mve_mull_int_predicated, 0b1, 0b10>; defm MVE_VMULLBu32 : MVE_VMULL_m; defm MVE_VMULLTu32 : MVE_VMULL_m; defm MVE_VMULLBp8 : MVE_VMULL_m; + int_arm_mve_mull_poly_predicated, 0b0, 0b01>; defm MVE_VMULLTp8 : MVE_VMULL_m; + int_arm_mve_mull_poly_predicated, 0b1, 0b01>; defm MVE_VMULLBp16 : MVE_VMULL_m; + int_arm_mve_mull_poly_predicated, 0b0, 0b10>; defm MVE_VMULLTp16 : MVE_VMULL_m; + int_arm_mve_mull_poly_predicated, 0b1, 0b10>; let Predicates = [HasMVEInt] in { def : Pat<(v2i64 (ARMvmulls (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))), @@ -4742,7 +4753,7 @@ class MVE_VxMULH size, bit round, list pattern=[]> : MVE_qDest_qSrc { + vpred_r, "", size, pattern> { bits<4> Qn; let Inst{28} = U; @@ -4807,7 +4818,7 @@ class MVE_VxMOVxN size, bit T, list pattern=[]> : MVE_qDest_qSrc { + vpred_n, "$Qd = $Qd_src", !if(size, 0b10, 0b01), pattern> { let Inst{28} = bit_28; let Inst{21-20} = 0b11; @@ -4952,7 +4963,7 @@ class MVE_VCVT_ff : MVE_qDest_qSrc { + vpred, cstr, 0b10, []> { let Inst{28} = op; let Inst{21-16} = 0b111111; let Inst{12} = T; @@ -5015,7 +5026,7 @@ class MVE_VxCADD size, bit halve, string cstr=""> : MVE_qDest_qSrc { + "$Qd, $Qn, $Qm, $rot", vpred_r, cstr, size, []> { bits<4> Qn; bit rot; @@ -5063,7 +5074,7 @@ class MVE_VADCSBC pattern=[]> : MVE_qDest_qSrc { + "$Qd, $Qn, $Qm", vpred_r, "", 0b10, pattern> { bits<4> Qn; let Inst{28} = subtract; @@ -5090,7 +5101,7 @@ class MVE_VQDMULL pattern=[]> : MVE_qDest_qSrc { + vpred_r, cstr, !if(size, 0b10, 0b01), pattern> { bits<4> Qn; let Inst{28} = size; @@ -5139,8 +5150,8 @@ defm MVE_VQDMULLs32 : MVE_VQDMULL_halves; // start of mve_qDest_rSrc class MVE_qr_base pattern=[]> - : MVE_p { + vpred_ops vpred, string cstr, bits<2> vecsize, list pattern=[]> + : MVE_p { bits<4> Qd; bits<4> Qn; bits<4> Rm; @@ -5156,19 +5167,19 @@ class MVE_qr_base pattern=[]> +class MVE_qDest_rSrc vecsize, list pattern=[]> : MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qn, rGPR:$Rm), - iname, suffix, "$Qd, $Qn, $Rm", vpred_r, cstr, - pattern>; + iname, suffix, "$Qd, $Qn, $Rm", vpred_r, cstr, + vecsize, pattern>; -class MVE_qDestSrc_rSrc pattern=[]> +class MVE_qDestSrc_rSrc vecsize, list pattern=[]> : MVE_qr_base<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qn, rGPR:$Rm), - iname, suffix, "$Qd, $Qn, $Rm", vpred_n, "$Qd = $Qd_src", - pattern>; + iname, suffix, "$Qd, $Qn, $Rm", vpred_n, "$Qd = $Qd_src", + vecsize, pattern>; -class MVE_qDest_single_rSrc pattern=[]> +class MVE_qDest_single_rSrc vecsize, list 
pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qd_src, rGPR:$Rm), NoItinerary, iname, - suffix, "$Qd, $Rm", vpred_n, "$Qd = $Qd_src", pattern> { + suffix, "$Qd, $Rm", vpred_n, "$Qd = $Qd_src", vecsize, pattern> { bits<4> Qd; bits<4> Rm; @@ -5206,7 +5217,7 @@ multiclass MVE_vec_scalar_int_pat_m size, bit bit_5, bit bit_12, bit bit_16, bit bit_28> - : MVE_qDest_rSrc { + : MVE_qDest_rSrc { let Inst{28} = bit_28; let Inst{21-20} = size; @@ -5274,7 +5285,7 @@ defm MVE_VQSUB_qr_u32 : MVE_VQSUB_qr_m; class MVE_VQDMULL_qr pattern=[]> - : MVE_qDest_rSrc { + : MVE_qDest_rSrc { let Inst{28} = size; let Inst{21-20} = 0b11; @@ -5319,12 +5330,12 @@ defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves; defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves; class MVE_VxADDSUB_qr bits_21_20, bit subtract, - list pattern=[]> - : MVE_qDest_rSrc { + bit bit_28, bits<2> size, bit subtract, + bits<2> vecsize, list pattern=[]> + : MVE_qDest_rSrc { let Inst{28} = bit_28; - let Inst{21-20} = bits_21_20; + let Inst{21-20} = size; let Inst{16} = 0b0; let Inst{12} = subtract; let Inst{8} = 0b1; @@ -5334,7 +5345,7 @@ class MVE_VxADDSUB_qr { - def "" : MVE_VxADDSUB_qr; + def "" : MVE_VxADDSUB_qr; defm : MVE_vec_scalar_int_pat_m(NAME), VTI, unpred_int, pred_int, 1, 1>; } @@ -5363,7 +5374,7 @@ defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m; multiclass MVE_VADDSUB_qr_f { - def "" : MVE_VxADDSUB_qr; + def "" : MVE_VxADDSUB_qr; defm : MVE_TwoOpPatternDup(NAME)>; } @@ -5382,7 +5393,7 @@ let Predicates = [HasMVEFloat] in { class MVE_VxSHL_qr size, bit bit_7, bit bit_17, list pattern=[]> - : MVE_qDest_single_rSrc { + : MVE_qDest_single_rSrc { let Inst{28} = U; let Inst{25-23} = 0b100; @@ -5444,7 +5455,7 @@ let Predicates = [HasMVEInt] in { } class MVE_VBRSR size, list pattern=[]> - : MVE_qDest_rSrc { + : MVE_qDest_rSrc { let Inst{28} = 0b1; let Inst{21-20} = size; @@ -5494,7 +5505,7 @@ let Predicates = [HasMVEFloat] in { } class MVE_VMUL_qr_int size> - : MVE_qDest_rSrc { + : MVE_qDest_rSrc { let Inst{28} = 0b0; let Inst{21-20} = size; @@ -5518,11 +5529,11 @@ defm MVE_VMUL_qr_i16 : MVE_VMUL_qr_int_m; defm MVE_VMUL_qr_i32 : MVE_VMUL_qr_int_m; class MVE_VxxMUL_qr bits_21_20, list pattern=[]> - : MVE_qDest_rSrc { + bit bit_28, bits<2> size, bits<2> vecsize, list pattern=[]> + : MVE_qDest_rSrc { let Inst{28} = bit_28; - let Inst{21-20} = bits_21_20; + let Inst{21-20} = size; let Inst{16} = 0b1; let Inst{12} = 0b0; let Inst{8} = 0b0; @@ -5532,7 +5543,7 @@ class MVE_VxxMUL_qr { - def "" : MVE_VxxMUL_qr; + def "" : MVE_VxxMUL_qr; let Predicates = [HasMVEInt] in { defm : MVE_TwoOpPatternDup(NAME)>; @@ -5558,7 +5569,7 @@ defm MVE_VQRDMULH_qr_s32 : MVE_VQRDMULH_qr_m; multiclass MVE_VxxMUL_qr_f_m { let validForTailPredication = 1 in - def "" : MVE_VxxMUL_qr<"vmul", VTI.Suffix, VTI.Size{0}, 0b11>; + def "" : MVE_VxxMUL_qr<"vmul", VTI.Suffix, VTI.Size{0}, 0b11, VTI.Size>; defm : MVE_TwoOpPatternDup(NAME)>; } @@ -5570,8 +5581,8 @@ let Predicates = [HasMVEFloat] in { class MVE_VFMAMLA_qr bits_21_20, bit S, - list pattern=[]> - : MVE_qDestSrc_rSrc { + bits<2> vecsize, list pattern=[]> + : MVE_qDestSrc_rSrc { let Inst{28} = bit_28; let Inst{21-20} = bits_21_20; @@ -5586,7 +5597,7 @@ class MVE_VFMAMLA_qr { def "": MVE_VFMAMLA_qr; + scalar_addend, VTI.Size>; defvar Inst = !cast(NAME); defvar pred_int = !cast("int_arm_mve_" # iname # "_n_predicated"); defvar v1 = (VTI.Vec MQPR:$v1); @@ -5628,7 +5639,7 @@ defm MVE_VMLAS_qr_u32 : MVE_VMLA_qr_multi<"vmlas", MVE_v4u32, 0b1>; multiclass MVE_VFMA_qr_multi { - def "": MVE_VFMAMLA_qr; + def "": MVE_VFMAMLA_qr; defvar Inst 
= !cast(NAME); defvar pred_int = int_arm_mve_fma_predicated; defvar v1 = (VTI.Vec MQPR:$v1); @@ -5677,7 +5688,7 @@ let Predicates = [HasMVEFloat] in { class MVE_VQDMLAH_qr size, bit bit_5, bit bit_12, list pattern=[]> - : MVE_qDestSrc_rSrc { + : MVE_qDestSrc_rSrc { let Inst{28} = U; let Inst{21-20} = size; @@ -5722,7 +5733,7 @@ class MVE_VxDUP size, bit bit_12, ValueType VT, SDPatternOperator vxdup> : MVE_p<(outs MQPR:$Qd, tGPREven:$Rn), (ins tGPREven:$Rn_src, MVE_VIDUP_imm:$imm), NoItinerary, - iname, suffix, "$Qd, $Rn, $imm", vpred_r, "$Rn = $Rn_src", + iname, suffix, "$Qd, $Rn, $imm", vpred_r, "$Rn = $Rn_src", size, [(set (VT MQPR:$Qd), (i32 tGPREven:$Rn), (vxdup (i32 tGPREven:$Rn_src), (i32 imm:$imm)))]> { bits<4> Qd; @@ -5757,7 +5768,7 @@ class MVE_VxWDUP size, bit bit_12, list pattern=[]> : MVE_p<(outs MQPR:$Qd, tGPREven:$Rn), (ins tGPREven:$Rn_src, tGPROdd:$Rm, MVE_VIDUP_imm:$imm), NoItinerary, - iname, suffix, "$Qd, $Rn, $Rm, $imm", vpred_r, "$Rn = $Rn_src", + iname, suffix, "$Qd, $Rn, $Rm, $imm", vpred_r, "$Rn = $Rn_src", size, pattern> { bits<4> Qd; bits<4> Rm; @@ -5792,7 +5803,7 @@ def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>; let isReMaterializable = 1 in class MVE_VCTPInst size, list pattern=[]> : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix, - "$Rn", vpred_n, "", pattern> { + "$Rn", vpred_n, "", size, pattern> { bits<4> Rn; let Inst{28-27} = 0b10; @@ -5849,6 +5860,7 @@ class MVE_VMOV_64bit let Inst{4} = idx2; let Inst{3-0} = Rt{3-0}; + let VecSize = 0b10; let hasSideEffects = 0; } @@ -5937,7 +5949,7 @@ class MVE_vldst24_base stage, bits<2> size, bit load, dag Oops, dag loadIops, dag wbIops, string iname, string ops, string cstr, list pattern=[]> - : MVE_MI { + : MVE_MI { bits<4> VQd; bits<4> Rn; @@ -6135,8 +6147,8 @@ def MVE_memD: MVE_memsz<0b11, 3, ?, "d", ["", "u", "s", "f"]>; // input values. 
class MVE_VLDRSTR_base pattern=[]> - : MVE_p { + string ops, string cstr, bits<2> vecsize, list pattern=[]> + : MVE_p { bits<3> Qd; let Inst{28} = U; @@ -6172,7 +6184,7 @@ class MVE_VLDRSTR_base - : MVE_VLDRSTR_base { + : MVE_VLDRSTR_base { bits<12> addr; let Inst{23} = addr{7}; let Inst{19-16} = addr{11-8}; @@ -6187,7 +6199,7 @@ class MVE_VLDRSTR_cw size, dag oops, dag iops, string asm, string suffix, IndexMode im, string ops, string cstr> - : MVE_VLDRSTR_base { + : MVE_VLDRSTR_base { bits<11> addr; let Inst{23} = addr{7}; let Inst{19} = memsz.encoding{0}; // enough to tell 16- from 32-bit @@ -6304,7 +6316,7 @@ class MVE_VLDRSTR_rq size, bit os, string asm, string suffix, int shift> : MVE_VLDRSTR_base:$addr)), - asm, suffix, "$Qd, $addr", dir.cstr> { + asm, suffix, "$Qd, $addr", dir.cstr, size> { bits<7> addr; let Inst{23} = 0b1; let Inst{19-16} = addr{6-3}; @@ -6437,7 +6449,7 @@ class MVE_VLDRSTR_qi : MVE_VLDRSTR_base:$addr)), - asm, suffix, "$Qd, $addr" # wbAsm, cstr # dir.cstr> { + asm, suffix, "$Qd, $addr" # wbAsm, cstr # dir.cstr, memsz.encoding> { bits<11> addr; let Inst{23} = addr{7}; let Inst{19-17} = addr{10-8}; @@ -6546,7 +6558,7 @@ foreach suffix = memsz.suffixes in { // end of MVE predicable load/store class MVE_VPT size, dag iops, string asm, list pattern=[]> - : MVE_MI<(outs ), iops, NoItinerary, !strconcat("vpt", "${Mk}", ".", suffix), asm, "", pattern> { + : MVE_MI<(outs ), iops, NoItinerary, !strconcat("vpt", "${Mk}", ".", suffix), asm, "", size, pattern> { bits<3> fc; bits<4> Mk; bits<3> Qn; @@ -6656,7 +6668,7 @@ def MVE_VPTv16s8r : MVE_VPTt2s<"s8", 0b00>; class MVE_VPTf pattern=[]> : MVE_MI<(outs ), iops, NoItinerary, !strconcat("vpt", "${Mk}", ".", suffix), asm, - "", pattern> { + "", !if(size, 0b01, 0b10), pattern> { bits<3> fc; bits<4> Mk; bits<3> Qn; @@ -6709,7 +6721,7 @@ def MVE_VPTv4f32r : MVE_VPTft2<"f32", 0b0>; def MVE_VPTv8f16r : MVE_VPTft2<"f16", 0b1>; def MVE_VPST : MVE_MI<(outs ), (ins vpt_mask:$Mk), NoItinerary, - !strconcat("vpst", "${Mk}"), "", "", []> { + !strconcat("vpst", "${Mk}"), "", "", 0b00, []> { bits<4> Mk; let Inst{31-23} = 0b111111100; @@ -6726,7 +6738,7 @@ def MVE_VPST : MVE_MI<(outs ), (ins vpt_mask:$Mk), NoItinerary, } def MVE_VPSEL : MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qn, MQPR:$Qm), NoItinerary, - "vpsel", "", "$Qd, $Qn, $Qm", vpred_n, "", []> { + "vpsel", "", "$Qd, $Qn, $Qm", vpred_n, "", 0b00, []> { bits<4> Qn; bits<4> Qd; bits<4> Qm; @@ -6832,7 +6844,7 @@ let Predicates = [HasMVEFloat] in { } def MVE_VPNOT : MVE_p<(outs VCCR:$P0), (ins VCCR:$P0_in), NoItinerary, - "vpnot", "", "", vpred_n, "", []> { + "vpnot", "", "", vpred_n, "", 0b00, []> { let Inst{31-0} = 0b11111110001100010000111101001101; let Unpredictable{19-17} = 0b111; let Unpredictable{12} = 0b1; diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 579e7d4..23e87e1 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -97,7 +97,15 @@ static bool isDomainMVE(MachineInstr *MI) { return Domain == ARMII::DomainMVE; } +static int getVecSize(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + uint64_t Flags = MCID.TSFlags; + return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift; +} + static bool shouldInspect(MachineInstr &MI) { + if (MI.isDebugInstr()) + return false; return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI); } @@ -371,6 +379,7 @@ namespace { SmallVector VCTPs; SmallPtrSet ToRemove; SmallPtrSet BlockMasksToRecompute; + 
SmallPtrSet DoubleWidthResultInstrs; bool Revert = false; bool CannotTailPredicate = false;
@@ -730,6 +739,20 @@ bool LowOverheadLoop::ValidateTailPredicate() { return false; } + // For any DoubleWidthResultInstrs we found whilst scanning instructions, they + // need to compute an output size that is no larger than what the VCTP mask + // operates on. The VecSize of a DoubleWidthResult is the larger vector size + // - the size it extends into - so the instruction is valid so long as its + // VecSize is less than or equal to the VCTP's VecSize. + unsigned VCTPVecSize = getVecSize(*VCTP); + for (MachineInstr *MI : DoubleWidthResultInstrs) { + unsigned InstrVecSize = getVecSize(*MI); + if (InstrVecSize > VCTPVecSize) { + LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP " + << "VecSize:\n" << *MI); + return false; + } + } + // Check that the value change of the element count is what we expect and // that the predication will be equivalent. For this we need: // NumElements = NumElements - VectorWidth. The sub will be a sub immediate
@@ -1233,8 +1256,13 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) { bool RequiresExplicitPredication = (MCID.TSFlags & ARMII::ValidForTailPredication) == 0; if (isDomainMVE(MI) && RequiresExplicitPredication) { - LLVM_DEBUG(if (!IsUse) - dbgs() << "ARM Loops: Can't tail predicate: " << *MI); + if (!IsUse && producesDoubleWidthResult(*MI)) { + DoubleWidthResultInstrs.insert(MI); + return true; + } + + LLVM_DEBUG(if (!IsUse) dbgs() + << "ARM Loops: Can't tail predicate: " << *MI); return IsUse; }
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index ecd9611..43f7575 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -408,6 +408,14 @@ namespace ARMII { // its input, typically reading from the top/bottom halves of the input(s). DoubleWidthResult = 1 << 23, + // The vector element size for MVE instructions. 00 = i8, 01 = i16, 10 = i32 + // and 11 = i64. This is the largest type if multiple are present, so an + // MVE_VMOVLs8bh is size 01=i16, as it extends from an i8 to an i16. There + // are some caveats so it cannot be used blindly, such as exchanging + // VMLADAVAs and complex instructions, which may use different input lanes. + VecSizeShift = 24, + VecSize = 3 << VecSizeShift, + //===------------------------------------------------------------------===// // Code domain.
DomainShift = 15, diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll b/llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll index 1eff548..5ddbb3b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll @@ -10,22 +10,13 @@ define void @vmovl_s32(i32* noalias nocapture %d, i32* nocapture readonly %s, i3 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB0_1: @ %vector.ph -; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w r3, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, r3 +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0], #16 -; CHECK-NEXT: le lr, .LBB0_2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB0_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: @@ -66,22 +57,13 @@ define void @vmovl_u16(i16* noalias nocapture %d, i16* nocapture readonly %s, i3 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB1_1: @ %vector.ph -; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: bic r3, r3, #7 -; CHECK-NEXT: sub.w r12, r3, #8 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w r3, r3, r12, lsr #3 -; CHECK-NEXT: dls lr, r3 +; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: subs r2, #8 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrht.u16 q0, [r1], #16 +; CHECK-NEXT: vldrh.u16 q0, [r1], #16 ; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r0], #16 -; CHECK-NEXT: le lr, .LBB1_2 +; CHECK-NEXT: vstrh.16 q0, [r0], #16 +; CHECK-NEXT: letp lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: @@ -174,35 +156,22 @@ define void @sunken_vmovl(i8* noalias %pTarget, i16 signext %iTargetStride, i8* ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: ldrsh.w r1, [sp, #8] ; CHECK-NEXT: vmov.i16 q0, #0x100 -; CHECK-NEXT: cmp r1, #8 -; CHECK-NEXT: mov r3, r1 -; CHECK-NEXT: it ge -; CHECK-NEXT: movge r3, #8 ; CHECK-NEXT: vldrb.u16 q1, [r2], #8 -; CHECK-NEXT: subs r3, r1, r3 ; CHECK-NEXT: vldrb.u16 q2, [r0], #8 -; CHECK-NEXT: add.w r12, r3, #7 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w r12, r3, r12, lsr #3 ; CHECK-NEXT: ldr r3, [sp, #12] -; CHECK-NEXT: dls lr, r12 +; CHECK-NEXT: dlstp.16 lr, r1 ; CHECK-NEXT: .LBB3_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.16 r1 -; CHECK-NEXT: subs r1, #8 ; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vsubt.i16 q3, q0, q1 +; CHECK-NEXT: vsub.i16 q3, q0, q1 ; CHECK-NEXT: vmovlb.u8 q2, q2 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vmult.i16 q3, q2, q3 -; CHECK-NEXT: vmlat.u16 q3, q1, r3 -; CHECK-NEXT: vshrt.u16 q3, q3, #8 -; CHECK-NEXT: vldrbt.u16 q1, [r2], #8 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u16 q2, [r0], #8 -; CHECK-NEXT: vstrbt.16 q3, [r0, #-16] -; CHECK-NEXT: le lr, .LBB3_1 +; CHECK-NEXT: vmul.i16 q3, q2, q3 +; CHECK-NEXT: vmla.u16 q3, q1, r3 +; CHECK-NEXT: vshr.u16 q3, q3, #8 +; CHECK-NEXT: vldrb.u16 q1, [r2], #8 +; CHECK-NEXT: vldrb.u16 q2, [r0], #8 +; CHECK-NEXT: vstrb.16 q3, [r0, #-16] +; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: 
pop {r7, pc} entry: diff --git a/llvm/unittests/Target/ARM/MachineInstrTest.cpp b/llvm/unittests/Target/ARM/MachineInstrTest.cpp index b9cfd9c..4113b79 100644 --- a/llvm/unittests/Target/ARM/MachineInstrTest.cpp +++ b/llvm/unittests/Target/ARM/MachineInstrTest.cpp @@ -1216,3 +1216,886 @@ TEST(MachineInstr, HasSideEffects) { << MII->getName(Op) << " has unexpected side effects"; } } + +TEST(MachineInstr, MVEVecSize) { + using namespace ARM; + auto MVEVecSize = [](unsigned Opcode) { + switch (Opcode) { + default: + dbgs() << Opcode << "\n"; + llvm_unreachable("Unexpected MVE instruction!"); + case MVE_ASRLi: + case MVE_ASRLr: + case MVE_LSLLi: + case MVE_LSLLr: + case MVE_LSRL: + case MVE_SQRSHR: + case MVE_SQRSHRL: + case MVE_SQSHL: + case MVE_SQSHLL: + case MVE_SRSHR: + case MVE_SRSHRL: + case MVE_UQRSHL: + case MVE_UQRSHLL: + case MVE_UQSHL: + case MVE_UQSHLL: + case MVE_URSHR: + case MVE_URSHRL: + case MVE_VABAVs8: + case MVE_VABAVu8: + case MVE_VABDs8: + case MVE_VABDu8: + case MVE_VABSs8: + case MVE_VADDVs8acc: + case MVE_VADDVs8no_acc: + case MVE_VADDVu8acc: + case MVE_VADDVu8no_acc: + case MVE_VADD_qr_i8: + case MVE_VADDi8: + case MVE_VBRSR8: + case MVE_VCADDi8: + case MVE_VCLSs8: + case MVE_VCLZs8: + case MVE_VCMPi8: + case MVE_VCMPi8r: + case MVE_VCMPs8: + case MVE_VCMPs8r: + case MVE_VCMPu8: + case MVE_VCMPu8r: + case MVE_VCTP8: + case MVE_VDDUPu8: + case MVE_VDUP8: + case MVE_VDWDUPu8: + case MVE_VHADD_qr_s8: + case MVE_VHADD_qr_u8: + case MVE_VHADDs8: + case MVE_VHADDu8: + case MVE_VHCADDs8: + case MVE_VHSUB_qr_s8: + case MVE_VHSUB_qr_u8: + case MVE_VHSUBs8: + case MVE_VHSUBu8: + case MVE_VIDUPu8: + case MVE_VIWDUPu8: + case MVE_VLD20_8: + case MVE_VLD20_8_wb: + case MVE_VLD21_8: + case MVE_VLD21_8_wb: + case MVE_VLD40_8: + case MVE_VLD40_8_wb: + case MVE_VLD41_8: + case MVE_VLD41_8_wb: + case MVE_VLD42_8: + case MVE_VLD42_8_wb: + case MVE_VLD43_8: + case MVE_VLD43_8_wb: + case MVE_VLDRBU8: + case MVE_VLDRBU8_post: + case MVE_VLDRBU8_pre: + case MVE_VLDRBU8_rq: + case MVE_VMAXAVs8: + case MVE_VMAXAs8: + case MVE_VMAXVs8: + case MVE_VMAXVu8: + case MVE_VMAXs8: + case MVE_VMAXu8: + case MVE_VMINAVs8: + case MVE_VMINAs8: + case MVE_VMINVs8: + case MVE_VMINVu8: + case MVE_VMINs8: + case MVE_VMINu8: + case MVE_VMLADAVas8: + case MVE_VMLADAVau8: + case MVE_VMLADAVaxs8: + case MVE_VMLADAVs8: + case MVE_VMLADAVu8: + case MVE_VMLADAVxs8: + case MVE_VMLAS_qr_s8: + case MVE_VMLAS_qr_u8: + case MVE_VMLA_qr_s8: + case MVE_VMLA_qr_u8: + case MVE_VMLSDAVas8: + case MVE_VMLSDAVaxs8: + case MVE_VMLSDAVs8: + case MVE_VMLSDAVxs8: + case MVE_VMOV_from_lane_s8: + case MVE_VMOV_from_lane_u8: + case MVE_VMOV_to_lane_8: + case MVE_VMOVimmi8: + case MVE_VMULHs8: + case MVE_VMULHu8: + case MVE_VMUL_qr_i8: + case MVE_VMULi8: + case MVE_VNEGs8: + case MVE_VPTv16i8: + case MVE_VPTv16i8r: + case MVE_VPTv16s8: + case MVE_VPTv16s8r: + case MVE_VPTv16u8: + case MVE_VPTv16u8r: + case MVE_VQABSs8: + case MVE_VQADD_qr_s8: + case MVE_VQADD_qr_u8: + case MVE_VQADDs8: + case MVE_VQADDu8: + case MVE_VQDMLADHXs8: + case MVE_VQDMLADHs8: + case MVE_VQDMLAH_qrs8: + case MVE_VQDMLASH_qrs8: + case MVE_VQDMLSDHXs8: + case MVE_VQDMLSDHs8: + case MVE_VQDMULH_qr_s8: + case MVE_VQDMULHi8: + case MVE_VQNEGs8: + case MVE_VQRDMLADHXs8: + case MVE_VQRDMLADHs8: + case MVE_VQRDMLAH_qrs8: + case MVE_VQRDMLASH_qrs8: + case MVE_VQRDMLSDHXs8: + case MVE_VQRDMLSDHs8: + case MVE_VQRDMULH_qr_s8: + case MVE_VQRDMULHi8: + case MVE_VQRSHL_by_vecs8: + case MVE_VQRSHL_by_vecu8: + case MVE_VQRSHL_qrs8: + case MVE_VQRSHL_qru8: + case 
MVE_VQSHLU_imms8: + case MVE_VQSHL_by_vecs8: + case MVE_VQSHL_by_vecu8: + case MVE_VQSHL_qrs8: + case MVE_VQSHL_qru8: + case MVE_VQSHLimms8: + case MVE_VQSHLimmu8: + case MVE_VQSUB_qr_s8: + case MVE_VQSUB_qr_u8: + case MVE_VQSUBs8: + case MVE_VQSUBu8: + case MVE_VRHADDs8: + case MVE_VRHADDu8: + case MVE_VRMULHs8: + case MVE_VRMULHu8: + case MVE_VRSHL_by_vecs8: + case MVE_VRSHL_by_vecu8: + case MVE_VRSHL_qrs8: + case MVE_VRSHL_qru8: + case MVE_VRSHR_imms8: + case MVE_VRSHR_immu8: + case MVE_VSHL_by_vecs8: + case MVE_VSHL_by_vecu8: + case MVE_VSHL_immi8: + case MVE_VSHL_qru8: + case MVE_VSHL_qrs8: + case MVE_VSHR_imms8: + case MVE_VSHR_immu8: + case MVE_VSLIimm8: + case MVE_VSRIimm8: + case MVE_VST20_8: + case MVE_VST20_8_wb: + case MVE_VST21_8: + case MVE_VST21_8_wb: + case MVE_VST40_8: + case MVE_VST40_8_wb: + case MVE_VST41_8: + case MVE_VST41_8_wb: + case MVE_VST42_8: + case MVE_VST42_8_wb: + case MVE_VST43_8: + case MVE_VST43_8_wb: + case MVE_VSTRB8_rq: + case MVE_VSTRBU8: + case MVE_VSTRBU8_post: + case MVE_VSTRBU8_pre: + case MVE_VSUB_qr_i8: + case MVE_VSUBi8: + case MVE_VAND: + case MVE_VBIC: + case MVE_VEOR: + case MVE_VMVN: + case MVE_VORN: + case MVE_VORR: + case MVE_VPNOT: + case MVE_VPSEL: + case MVE_VPST: + return 0; + case MVE_VABAVs16: + case MVE_VABAVu16: + case MVE_VABDf16: + case MVE_VABDs16: + case MVE_VABDu16: + case MVE_VABSf16: + case MVE_VABSs16: + case MVE_VADDVs16acc: + case MVE_VADDVs16no_acc: + case MVE_VADDVu16acc: + case MVE_VADDVu16no_acc: + case MVE_VADD_qr_f16: + case MVE_VADD_qr_i16: + case MVE_VADDf16: + case MVE_VADDi16: + case MVE_VBICimmi16: + case MVE_VBRSR16: + case MVE_VCADDf16: + case MVE_VCADDi16: + case MVE_VCLSs16: + case MVE_VCLZs16: + case MVE_VCMLAf16: + case MVE_VCMPf16: + case MVE_VCMPf16r: + case MVE_VCMPi16: + case MVE_VCMPi16r: + case MVE_VCMPs16: + case MVE_VCMPs16r: + case MVE_VCMPu16: + case MVE_VCMPu16r: + case MVE_VCMULf16: + case MVE_VCTP16: + case MVE_VCVTf16s16_fix: + case MVE_VCVTf16s16n: + case MVE_VCVTf16u16_fix: + case MVE_VCVTf16u16n: + case MVE_VCVTs16f16_fix: + case MVE_VCVTs16f16a: + case MVE_VCVTs16f16m: + case MVE_VCVTs16f16n: + case MVE_VCVTs16f16p: + case MVE_VCVTs16f16z: + case MVE_VCVTu16f16_fix: + case MVE_VCVTu16f16a: + case MVE_VCVTu16f16m: + case MVE_VCVTu16f16n: + case MVE_VCVTu16f16p: + case MVE_VCVTu16f16z: + case MVE_VDDUPu16: + case MVE_VDUP16: + case MVE_VDWDUPu16: + case MVE_VFMA_qr_Sf16: + case MVE_VFMA_qr_f16: + case MVE_VFMAf16: + case MVE_VFMSf16: + case MVE_VHADD_qr_s16: + case MVE_VHADD_qr_u16: + case MVE_VHADDs16: + case MVE_VHADDu16: + case MVE_VHCADDs16: + case MVE_VHSUB_qr_s16: + case MVE_VHSUB_qr_u16: + case MVE_VHSUBs16: + case MVE_VHSUBu16: + case MVE_VIDUPu16: + case MVE_VIWDUPu16: + case MVE_VLD20_16: + case MVE_VLD20_16_wb: + case MVE_VLD21_16: + case MVE_VLD21_16_wb: + case MVE_VLD40_16: + case MVE_VLD40_16_wb: + case MVE_VLD41_16: + case MVE_VLD41_16_wb: + case MVE_VLD42_16: + case MVE_VLD42_16_wb: + case MVE_VLD43_16: + case MVE_VLD43_16_wb: + case MVE_VLDRBS16: + case MVE_VLDRBS16_post: + case MVE_VLDRBS16_pre: + case MVE_VLDRBS16_rq: + case MVE_VLDRBU16: + case MVE_VLDRBU16_post: + case MVE_VLDRBU16_pre: + case MVE_VLDRBU16_rq: + case MVE_VLDRHU16: + case MVE_VLDRHU16_post: + case MVE_VLDRHU16_pre: + case MVE_VLDRHU16_rq: + case MVE_VLDRHU16_rq_u: + case MVE_VMAXAVs16: + case MVE_VMAXAs16: + case MVE_VMAXNMAVf16: + case MVE_VMAXNMAf16: + case MVE_VMAXNMVf16: + case MVE_VMAXNMf16: + case MVE_VMAXVs16: + case MVE_VMAXVu16: + case MVE_VMAXs16: + case MVE_VMAXu16: + case MVE_VMINAVs16: + 
case MVE_VMINAs16: + case MVE_VMINNMAVf16: + case MVE_VMINNMAf16: + case MVE_VMINNMVf16: + case MVE_VMINNMf16: + case MVE_VMINVs16: + case MVE_VMINVu16: + case MVE_VMINs16: + case MVE_VMINu16: + case MVE_VMLADAVas16: + case MVE_VMLADAVau16: + case MVE_VMLADAVaxs16: + case MVE_VMLADAVs16: + case MVE_VMLADAVu16: + case MVE_VMLADAVxs16: + case MVE_VMLALDAVas16: + case MVE_VMLALDAVau16: + case MVE_VMLALDAVaxs16: + case MVE_VMLALDAVs16: + case MVE_VMLALDAVu16: + case MVE_VMLALDAVxs16: + case MVE_VMLAS_qr_s16: + case MVE_VMLAS_qr_u16: + case MVE_VMLA_qr_s16: + case MVE_VMLA_qr_u16: + case MVE_VMLSDAVas16: + case MVE_VMLSDAVaxs16: + case MVE_VMLSDAVs16: + case MVE_VMLSDAVxs16: + case MVE_VMLSLDAVas16: + case MVE_VMLSLDAVaxs16: + case MVE_VMLSLDAVs16: + case MVE_VMLSLDAVxs16: + case MVE_VMOVNi16bh: + case MVE_VMOVNi16th: + case MVE_VMOV_from_lane_s16: + case MVE_VMOV_from_lane_u16: + case MVE_VMOV_to_lane_16: + case MVE_VMOVimmi16: + case MVE_VMOVLs8bh: + case MVE_VMOVLs8th: + case MVE_VMOVLu8bh: + case MVE_VMOVLu8th: + case MVE_VMULLBp8: + case MVE_VMULLBs8: + case MVE_VMULLBu8: + case MVE_VMULLTp8: + case MVE_VMULLTs8: + case MVE_VMULLTu8: + case MVE_VMULHs16: + case MVE_VMULHu16: + case MVE_VMUL_qr_f16: + case MVE_VMUL_qr_i16: + case MVE_VMULf16: + case MVE_VMULi16: + case MVE_VMVNimmi16: + case MVE_VNEGf16: + case MVE_VNEGs16: + case MVE_VORRimmi16: + case MVE_VPTv8f16: + case MVE_VPTv8f16r: + case MVE_VPTv8i16: + case MVE_VPTv8i16r: + case MVE_VPTv8s16: + case MVE_VPTv8s16r: + case MVE_VPTv8u16: + case MVE_VPTv8u16r: + case MVE_VQABSs16: + case MVE_VQADD_qr_s16: + case MVE_VQADD_qr_u16: + case MVE_VQADDs16: + case MVE_VQADDu16: + case MVE_VQDMLADHXs16: + case MVE_VQDMLADHs16: + case MVE_VQDMLAH_qrs16: + case MVE_VQDMLASH_qrs16: + case MVE_VQDMLSDHXs16: + case MVE_VQDMLSDHs16: + case MVE_VQDMULH_qr_s16: + case MVE_VQDMULHi16: + case MVE_VQDMULL_qr_s16bh: + case MVE_VQDMULL_qr_s16th: + case MVE_VQDMULLs16bh: + case MVE_VQDMULLs16th: + case MVE_VQMOVNs16bh: + case MVE_VQMOVNs16th: + case MVE_VQMOVNu16bh: + case MVE_VQMOVNu16th: + case MVE_VQMOVUNs16bh: + case MVE_VQMOVUNs16th: + case MVE_VQNEGs16: + case MVE_VQRDMLADHXs16: + case MVE_VQRDMLADHs16: + case MVE_VQRDMLAH_qrs16: + case MVE_VQRDMLASH_qrs16: + case MVE_VQRDMLSDHXs16: + case MVE_VQRDMLSDHs16: + case MVE_VQRDMULH_qr_s16: + case MVE_VQRDMULHi16: + case MVE_VQRSHL_by_vecs16: + case MVE_VQRSHL_by_vecu16: + case MVE_VQRSHL_qrs16: + case MVE_VQRSHL_qru16: + case MVE_VQRSHRNbhs16: + case MVE_VQRSHRNbhu16: + case MVE_VQRSHRNths16: + case MVE_VQRSHRNthu16: + case MVE_VQRSHRUNs16bh: + case MVE_VQRSHRUNs16th: + case MVE_VQSHLU_imms16: + case MVE_VQSHL_by_vecs16: + case MVE_VQSHL_by_vecu16: + case MVE_VQSHL_qrs16: + case MVE_VQSHL_qru16: + case MVE_VQSHLimms16: + case MVE_VQSHLimmu16: + case MVE_VQSHRNbhs16: + case MVE_VQSHRNbhu16: + case MVE_VQSHRNths16: + case MVE_VQSHRNthu16: + case MVE_VQSHRUNs16bh: + case MVE_VQSHRUNs16th: + case MVE_VQSUB_qr_s16: + case MVE_VQSUB_qr_u16: + case MVE_VQSUBs16: + case MVE_VQSUBu16: + case MVE_VREV16_8: + case MVE_VRHADDs16: + case MVE_VRHADDu16: + case MVE_VRINTf16A: + case MVE_VRINTf16M: + case MVE_VRINTf16N: + case MVE_VRINTf16P: + case MVE_VRINTf16X: + case MVE_VRINTf16Z: + case MVE_VRMULHs16: + case MVE_VRMULHu16: + case MVE_VRSHL_by_vecs16: + case MVE_VRSHL_by_vecu16: + case MVE_VRSHL_qrs16: + case MVE_VRSHL_qru16: + case MVE_VRSHRNi16bh: + case MVE_VRSHRNi16th: + case MVE_VRSHR_imms16: + case MVE_VRSHR_immu16: + case MVE_VSHLL_imms8bh: + case MVE_VSHLL_imms8th: + case MVE_VSHLL_immu8bh: + case 
MVE_VSHLL_immu8th: + case MVE_VSHLL_lws8bh: + case MVE_VSHLL_lws8th: + case MVE_VSHLL_lwu8bh: + case MVE_VSHLL_lwu8th: + case MVE_VSHL_by_vecs16: + case MVE_VSHL_by_vecu16: + case MVE_VSHL_immi16: + case MVE_VSHL_qrs16: + case MVE_VSHL_qru16: + case MVE_VSHRNi16bh: + case MVE_VSHRNi16th: + case MVE_VSHR_imms16: + case MVE_VSHR_immu16: + case MVE_VSLIimm16: + case MVE_VSRIimm16: + case MVE_VST20_16: + case MVE_VST20_16_wb: + case MVE_VST21_16: + case MVE_VST21_16_wb: + case MVE_VST40_16: + case MVE_VST40_16_wb: + case MVE_VST41_16: + case MVE_VST41_16_wb: + case MVE_VST42_16: + case MVE_VST42_16_wb: + case MVE_VST43_16: + case MVE_VST43_16_wb: + case MVE_VSTRB16: + case MVE_VSTRB16_post: + case MVE_VSTRB16_pre: + case MVE_VSTRB16_rq: + case MVE_VSTRH16_rq: + case MVE_VSTRH16_rq_u: + case MVE_VSTRHU16: + case MVE_VSTRHU16_post: + case MVE_VSTRHU16_pre: + case MVE_VSUB_qr_f16: + case MVE_VSUB_qr_i16: + case MVE_VSUBf16: + case MVE_VSUBi16: + return 1; + case MVE_VABAVs32: + case MVE_VABAVu32: + case MVE_VABDf32: + case MVE_VABDs32: + case MVE_VABDu32: + case MVE_VABSf32: + case MVE_VABSs32: + case MVE_VADC: + case MVE_VADCI: + case MVE_VADDLVs32acc: + case MVE_VADDLVs32no_acc: + case MVE_VADDLVu32acc: + case MVE_VADDLVu32no_acc: + case MVE_VADDVs32acc: + case MVE_VADDVs32no_acc: + case MVE_VADDVu32acc: + case MVE_VADDVu32no_acc: + case MVE_VADD_qr_f32: + case MVE_VADD_qr_i32: + case MVE_VADDf32: + case MVE_VADDi32: + case MVE_VBICimmi32: + case MVE_VBRSR32: + case MVE_VCADDf32: + case MVE_VCADDi32: + case MVE_VCLSs32: + case MVE_VCLZs32: + case MVE_VCMLAf32: + case MVE_VCMPf32: + case MVE_VCMPf32r: + case MVE_VCMPi32: + case MVE_VCMPi32r: + case MVE_VCMPs32: + case MVE_VCMPs32r: + case MVE_VCMPu32: + case MVE_VCMPu32r: + case MVE_VCMULf32: + case MVE_VCTP32: + case MVE_VCVTf16f32bh: + case MVE_VCVTf16f32th: + case MVE_VCVTf32f16bh: + case MVE_VCVTf32f16th: + case MVE_VCVTf32s32_fix: + case MVE_VCVTf32s32n: + case MVE_VCVTf32u32_fix: + case MVE_VCVTf32u32n: + case MVE_VCVTs32f32_fix: + case MVE_VCVTs32f32a: + case MVE_VCVTs32f32m: + case MVE_VCVTs32f32n: + case MVE_VCVTs32f32p: + case MVE_VCVTs32f32z: + case MVE_VCVTu32f32_fix: + case MVE_VCVTu32f32a: + case MVE_VCVTu32f32m: + case MVE_VCVTu32f32n: + case MVE_VCVTu32f32p: + case MVE_VCVTu32f32z: + case MVE_VDDUPu32: + case MVE_VDUP32: + case MVE_VDWDUPu32: + case MVE_VFMA_qr_Sf32: + case MVE_VFMA_qr_f32: + case MVE_VFMAf32: + case MVE_VFMSf32: + case MVE_VHADD_qr_s32: + case MVE_VHADD_qr_u32: + case MVE_VHADDs32: + case MVE_VHADDu32: + case MVE_VHCADDs32: + case MVE_VHSUB_qr_s32: + case MVE_VHSUB_qr_u32: + case MVE_VHSUBs32: + case MVE_VHSUBu32: + case MVE_VIDUPu32: + case MVE_VIWDUPu32: + case MVE_VLD20_32: + case MVE_VLD20_32_wb: + case MVE_VLD21_32: + case MVE_VLD21_32_wb: + case MVE_VLD40_32: + case MVE_VLD40_32_wb: + case MVE_VLD41_32: + case MVE_VLD41_32_wb: + case MVE_VLD42_32: + case MVE_VLD42_32_wb: + case MVE_VLD43_32: + case MVE_VLD43_32_wb: + case MVE_VLDRBS32: + case MVE_VLDRBS32_post: + case MVE_VLDRBS32_pre: + case MVE_VLDRBS32_rq: + case MVE_VLDRBU32: + case MVE_VLDRBU32_post: + case MVE_VLDRBU32_pre: + case MVE_VLDRBU32_rq: + case MVE_VLDRHS32: + case MVE_VLDRHS32_post: + case MVE_VLDRHS32_pre: + case MVE_VLDRHS32_rq: + case MVE_VLDRHS32_rq_u: + case MVE_VLDRHU32: + case MVE_VLDRHU32_post: + case MVE_VLDRHU32_pre: + case MVE_VLDRHU32_rq: + case MVE_VLDRHU32_rq_u: + case MVE_VLDRWU32: + case MVE_VLDRWU32_post: + case MVE_VLDRWU32_pre: + case MVE_VLDRWU32_qi: + case MVE_VLDRWU32_qi_pre: + case MVE_VLDRWU32_rq: + case 
MVE_VLDRWU32_rq_u: + case MVE_VMAXAVs32: + case MVE_VMAXAs32: + case MVE_VMAXNMAVf32: + case MVE_VMAXNMAf32: + case MVE_VMAXNMVf32: + case MVE_VMAXNMf32: + case MVE_VMAXVs32: + case MVE_VMAXVu32: + case MVE_VMAXs32: + case MVE_VMAXu32: + case MVE_VMINAVs32: + case MVE_VMINAs32: + case MVE_VMINNMAVf32: + case MVE_VMINNMAf32: + case MVE_VMINNMVf32: + case MVE_VMINNMf32: + case MVE_VMINVs32: + case MVE_VMINVu32: + case MVE_VMINs32: + case MVE_VMINu32: + case MVE_VMLADAVas32: + case MVE_VMLADAVau32: + case MVE_VMLADAVaxs32: + case MVE_VMLADAVs32: + case MVE_VMLADAVu32: + case MVE_VMLADAVxs32: + case MVE_VMLALDAVas32: + case MVE_VMLALDAVau32: + case MVE_VMLALDAVaxs32: + case MVE_VMLALDAVs32: + case MVE_VMLALDAVu32: + case MVE_VMLALDAVxs32: + case MVE_VMLAS_qr_s32: + case MVE_VMLAS_qr_u32: + case MVE_VMLA_qr_s32: + case MVE_VMLA_qr_u32: + case MVE_VMLSDAVas32: + case MVE_VMLSDAVaxs32: + case MVE_VMLSDAVs32: + case MVE_VMLSDAVxs32: + case MVE_VMLSLDAVas32: + case MVE_VMLSLDAVaxs32: + case MVE_VMLSLDAVs32: + case MVE_VMLSLDAVxs32: + case MVE_VMOVNi32bh: + case MVE_VMOVNi32th: + case MVE_VMOV_from_lane_32: + case MVE_VMOV_q_rr: + case MVE_VMOV_rr_q: + case MVE_VMOV_to_lane_32: + case MVE_VMOVimmf32: + case MVE_VMOVimmi32: + case MVE_VMOVLs16bh: + case MVE_VMOVLs16th: + case MVE_VMOVLu16bh: + case MVE_VMOVLu16th: + case MVE_VMULHs32: + case MVE_VMULHu32: + case MVE_VMULLBp16: + case MVE_VMULLBs16: + case MVE_VMULLBu16: + case MVE_VMULLTp16: + case MVE_VMULLTs16: + case MVE_VMULLTu16: + case MVE_VMUL_qr_f32: + case MVE_VMUL_qr_i32: + case MVE_VMULf32: + case MVE_VMULi32: + case MVE_VMVNimmi32: + case MVE_VNEGf32: + case MVE_VNEGs32: + case MVE_VORRimmi32: + case MVE_VPTv4f32: + case MVE_VPTv4f32r: + case MVE_VPTv4i32: + case MVE_VPTv4i32r: + case MVE_VPTv4s32: + case MVE_VPTv4s32r: + case MVE_VPTv4u32: + case MVE_VPTv4u32r: + case MVE_VQABSs32: + case MVE_VQADD_qr_s32: + case MVE_VQADD_qr_u32: + case MVE_VQADDs32: + case MVE_VQADDu32: + case MVE_VQDMLADHXs32: + case MVE_VQDMLADHs32: + case MVE_VQDMLAH_qrs32: + case MVE_VQDMLASH_qrs32: + case MVE_VQDMLSDHXs32: + case MVE_VQDMLSDHs32: + case MVE_VQDMULH_qr_s32: + case MVE_VQDMULHi32: + case MVE_VQDMULL_qr_s32bh: + case MVE_VQDMULL_qr_s32th: + case MVE_VQDMULLs32bh: + case MVE_VQDMULLs32th: + case MVE_VQMOVNs32bh: + case MVE_VQMOVNs32th: + case MVE_VQMOVNu32bh: + case MVE_VQMOVNu32th: + case MVE_VQMOVUNs32bh: + case MVE_VQMOVUNs32th: + case MVE_VQNEGs32: + case MVE_VQRDMLADHXs32: + case MVE_VQRDMLADHs32: + case MVE_VQRDMLAH_qrs32: + case MVE_VQRDMLASH_qrs32: + case MVE_VQRDMLSDHXs32: + case MVE_VQRDMLSDHs32: + case MVE_VQRDMULH_qr_s32: + case MVE_VQRDMULHi32: + case MVE_VQRSHL_by_vecs32: + case MVE_VQRSHL_by_vecu32: + case MVE_VQRSHL_qrs32: + case MVE_VQRSHL_qru32: + case MVE_VQRSHRNbhs32: + case MVE_VQRSHRNbhu32: + case MVE_VQRSHRNths32: + case MVE_VQRSHRNthu32: + case MVE_VQRSHRUNs32bh: + case MVE_VQRSHRUNs32th: + case MVE_VQSHLU_imms32: + case MVE_VQSHL_by_vecs32: + case MVE_VQSHL_by_vecu32: + case MVE_VQSHL_qrs32: + case MVE_VQSHL_qru32: + case MVE_VQSHLimms32: + case MVE_VQSHLimmu32: + case MVE_VQSHRNbhs32: + case MVE_VQSHRNbhu32: + case MVE_VQSHRNths32: + case MVE_VQSHRNthu32: + case MVE_VQSHRUNs32bh: + case MVE_VQSHRUNs32th: + case MVE_VQSUB_qr_s32: + case MVE_VQSUB_qr_u32: + case MVE_VQSUBs32: + case MVE_VQSUBu32: + case MVE_VREV32_16: + case MVE_VREV32_8: + case MVE_VRHADDs32: + case MVE_VRHADDu32: + case MVE_VRINTf32A: + case MVE_VRINTf32M: + case MVE_VRINTf32N: + case MVE_VRINTf32P: + case MVE_VRINTf32X: + case MVE_VRINTf32Z: + case 
MVE_VRMLALDAVHas32: + case MVE_VRMLALDAVHau32: + case MVE_VRMLALDAVHaxs32: + case MVE_VRMLALDAVHs32: + case MVE_VRMLALDAVHu32: + case MVE_VRMLALDAVHxs32: + case MVE_VRMLSLDAVHas32: + case MVE_VRMLSLDAVHaxs32: + case MVE_VRMLSLDAVHs32: + case MVE_VRMLSLDAVHxs32: + case MVE_VRMULHs32: + case MVE_VRMULHu32: + case MVE_VRSHL_by_vecs32: + case MVE_VRSHL_by_vecu32: + case MVE_VRSHL_qrs32: + case MVE_VRSHL_qru32: + case MVE_VRSHRNi32bh: + case MVE_VRSHRNi32th: + case MVE_VRSHR_imms32: + case MVE_VRSHR_immu32: + case MVE_VSBC: + case MVE_VSBCI: + case MVE_VSHLC: + case MVE_VSHLL_imms16bh: + case MVE_VSHLL_imms16th: + case MVE_VSHLL_immu16bh: + case MVE_VSHLL_immu16th: + case MVE_VSHLL_lws16bh: + case MVE_VSHLL_lws16th: + case MVE_VSHLL_lwu16bh: + case MVE_VSHLL_lwu16th: + case MVE_VSHL_by_vecs32: + case MVE_VSHL_by_vecu32: + case MVE_VSHL_immi32: + case MVE_VSHL_qrs32: + case MVE_VSHL_qru32: + case MVE_VSHRNi32bh: + case MVE_VSHRNi32th: + case MVE_VSHR_imms32: + case MVE_VSHR_immu32: + case MVE_VSLIimm32: + case MVE_VSRIimm32: + case MVE_VST20_32: + case MVE_VST20_32_wb: + case MVE_VST21_32: + case MVE_VST21_32_wb: + case MVE_VST40_32: + case MVE_VST40_32_wb: + case MVE_VST41_32: + case MVE_VST41_32_wb: + case MVE_VST42_32: + case MVE_VST42_32_wb: + case MVE_VST43_32: + case MVE_VST43_32_wb: + case MVE_VSTRB32: + case MVE_VSTRB32_post: + case MVE_VSTRB32_pre: + case MVE_VSTRB32_rq: + case MVE_VSTRH32: + case MVE_VSTRH32_post: + case MVE_VSTRH32_pre: + case MVE_VSTRH32_rq: + case MVE_VSTRH32_rq_u: + case MVE_VSTRW32_qi: + case MVE_VSTRW32_qi_pre: + case MVE_VSTRW32_rq: + case MVE_VSTRW32_rq_u: + case MVE_VSTRWU32: + case MVE_VSTRWU32_post: + case MVE_VSTRWU32_pre: + case MVE_VSUB_qr_f32: + case MVE_VSUB_qr_i32: + case MVE_VSUBf32: + case MVE_VSUBi32: + return 2; + case MVE_VCTP64: + case MVE_VLDRDU64_qi: + case MVE_VLDRDU64_qi_pre: + case MVE_VLDRDU64_rq: + case MVE_VLDRDU64_rq_u: + case MVE_VMULLBs32: + case MVE_VMULLBu32: + case MVE_VMULLTs32: + case MVE_VMULLTu32: + case MVE_VMOVimmi64: + case MVE_VREV64_16: + case MVE_VREV64_32: + case MVE_VREV64_8: + case MVE_VSTRD64_qi: + case MVE_VSTRD64_qi_pre: + case MVE_VSTRD64_rq: + case MVE_VSTRD64_rq_u: + return 3; + } + }; + LLVMInitializeARMTargetInfo(); + LLVMInitializeARMTarget(); + LLVMInitializeARMTargetMC(); + + auto TT(Triple::normalize("thumbv8.1m.main-none-none-eabi")); + std::string Error; + const Target *T = TargetRegistry::lookupTarget(TT, Error); + if (!T) { + dbgs() << Error; + return; + } + + TargetOptions Options; + auto TM = std::unique_ptr( + static_cast( + T->createTargetMachine(TT, "generic", "", Options, None, None, + CodeGenOpt::Default))); + ARMSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), + std::string(TM->getTargetFeatureString()), + *static_cast(TM.get()), false); + + auto MII = TM->getMCInstrInfo(); + for (unsigned i = 0; i < ARM::INSTRUCTION_LIST_END; ++i) { + uint64_t Flags = MII->get(i).TSFlags; + if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE) + continue; + int Size = (Flags & ARMII::VecSize) >> ARMII::VecSizeShift; + ASSERT_EQ(MVEVecSize(i), Size) + << MII->getName(i) + << ": mismatched expectation for MVE vec size\n"; + } +} \ No newline at end of file -- 2.7.4
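
For reference, the vmovl_s32 codegen change above corresponds to a source loop of roughly the following shape (an illustrative sketch of the sign-extend-inreg pattern described in the commit message; the IR bodies are elided in the hunk, so the exact input is not shown here):

  // Truncating an i32 load to i16 and sign-extending back to i32 becomes a
  // vmovlb.s16 in the vector body, which this patch now permits inside a
  // dlstp/letp tail-predicated loop instead of forcing vctp/vpst sequences.
  void vmovl_s32(int *d, const int *s, int n) {
    for (int i = 0; i < n; i++)
      d[i] = static_cast<short>(s[i]);
  }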
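
The unit test asserts the encoded element size for every MVE opcode, from 0 for the MVE_VCTP8 group up to 3 for the MVE_VCTP64 group. A consumer such as the low-overhead-loops pass can decode the field with the same ARMII::VecSize mask and ARMII::VecSizeShift used in the test loop. A minimal sketch of that decode and of the size check the patch relies on — getMVEVecSizeInBits and vmovlFitsUnderVCTP are hypothetical helper names, not the patch's own functions:

  #include "MCTargetDesc/ARMBaseInfo.h"
  #include "llvm/CodeGen/MachineInstr.h"

  using namespace llvm;

  // Decode the element size in bits from the two VecSize TSFlag bits,
  // matching the groups asserted in the test above: 0 -> 8, 1 -> 16,
  // 2 -> 32, 3 -> 64.
  static unsigned getMVEVecSizeInBits(const MachineInstr &MI) {
    uint64_t Flags = MI.getDesc().TSFlags;
    unsigned Encoded = (Flags & ARMII::VecSize) >> ARMII::VecSizeShift;
    return 8u << Encoded;
  }

  // A VMOVL acting as an extend-inreg is tolerable under tail predication
  // when the element size it extends into is no wider than the elements
  // predicated by the loop's VCTP (e.g. vmovlb.s16 producing i32 lanes
  // inside a vctp.32 loop, as in the vmovl_s32 test).
  static bool vmovlFitsUnderVCTP(const MachineInstr &VMOVL,
                                 unsigned VCTPElementBits) {
    return getMVEVecSizeInBits(VMOVL) <= VCTPElementBits;
  }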