[AArch64][SVE] Fix VLS mulh tests to use literal vectors

author    David Truby <david.truby@arm.com>  Fri, 14 Jan 2022 14:20:44 +0000 (14:20 +0000)
committer David Truby <david.truby@arm.com>  Fri, 14 Jan 2022 14:21:36 +0000 (14:21 +0000)

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
index 0637922..42e1036 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
@@ -1,11 +1,11 @@
  ; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_EQ_256
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK
  ; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
  ; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
  ; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
  ; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
  ; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
  ; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
  ; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
@@ -61,7 +61,7 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
    %1 = sext <8 x i8> %op1 to <8 x i16>
    %2 = sext <8 x i8> %op2 to <8 x i16>
    %mul = mul <8 x i16> %1, %2
-  %shr = lshr <8 x i16> %mul, %splat
+  %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    %res = trunc <8 x i16> %shr to <8 x i8>
    ret <8 x i8> %res
  }
@@ -74,69 +74,43 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
  ; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
  ; CHECK-NEXT:    uzp2 v0.16b, v0.16b, v2.16b
  ; CHECK-NEXT:    ret
-  %insert = insertelement <16 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
    %1 = sext <16 x i8> %op1 to <16 x i16>
    %2 = sext <16 x i8> %op2 to <16 x i16>
    %mul = mul <16 x i16> %1, %2
-  %shr = lshr <16 x i16> %mul, %splat
+  %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    %res = trunc <16 x i16> %shr to <16 x i8>
    ret <16 x i8> %res
  }
  
  define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
-; VBITS_EQ_256-LABEL: smulh_v32i8:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ptrue p0.b, vl32
-; VBITS_EQ_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
-; VBITS_EQ_256-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: smulh_v32i8:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    ld1sb { z0.h }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1sb { z1.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    mul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_512-NEXT:    lsr z0.h, p0/m, z0.h, #8
-; VBITS_GE_512-NEXT:    st1b { z0.h }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
-
+; VBITS_GE_256-LABEL: smulh_v32i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
    %op1 = load <32 x i8>, <32 x i8>* %a
    %op2 = load <32 x i8>, <32 x i8>* %b
-  %insert = insertelement <32 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
    %1 = sext <32 x i8> %op1 to <32 x i16>
    %2 = sext <32 x i8> %op2 to <32 x i16>
    %mul = mul <32 x i16> %1, %2
-  %shr = lshr <32 x i16> %mul, %splat
+  %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    %res = trunc <32 x i16> %shr to <32 x i8>
    store <32 x i8> %res, <32 x i8>* %a
    ret void
  }
  
  define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; VBITS_EQ_512-LABEL: smulh_v64i8:
-; VBITS_EQ_512:       // %bb.0:
-; VBITS_EQ_512-NEXT:    ptrue p0.b, vl64
-; VBITS_EQ_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_EQ_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_EQ_512-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
-; VBITS_EQ_512-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_EQ_512-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: smulh_v64i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1sb { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1sb { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    mul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_1024-NEXT:    lsr z0.h, p0/m, z0.h, #8
-; VBITS_GE_1024-NEXT:    st1b { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
-
+; VBITS_GE_512-LABEL: smulh_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
    %op1 = load <64 x i8>, <64 x i8>* %a
    %op2 = load <64 x i8>, <64 x i8>* %b
    %insert = insertelement <64 x i16> undef, i16 8, i64 0
@@ -144,40 +118,28 @@ define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
    %1 = sext <64 x i8> %op1 to <64 x i16>
    %2 = sext <64 x i8> %op2 to <64 x i16>
    %mul = mul <64 x i16> %1, %2
-  %shr = lshr <64 x i16> %mul, %splat
+  %shr = lshr <64 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    %res = trunc <64 x i16> %shr to <64 x i8>
    store <64 x i8> %res, <64 x i8>* %a
    ret void
  }
  
  define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
-; VBITS_EQ_1024-LABEL: smulh_v128i8:
-; VBITS_EQ_1024:       // %bb.0:
-; VBITS_EQ_1024-NEXT:    ptrue p0.b, vl128
-; VBITS_EQ_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_EQ_1024-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_EQ_1024-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
-; VBITS_EQ_1024-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_EQ_1024-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: smulh_v128i8:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1sb { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1sb { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    mul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_2048-NEXT:    lsr z0.h, p0/m, z0.h, #8
-; VBITS_GE_2048-NEXT:    st1b { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+; VBITS_GE_1024-LABEL: smulh_v128i8:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.b, vl128
+; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_1024-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_1024-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
  
    %op1 = load <128 x i8>, <128 x i8>* %a
    %op2 = load <128 x i8>, <128 x i8>* %b
-  %insert = insertelement <128 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
    %1 = sext <128 x i8> %op1 to <128 x i16>
    %2 = sext <128 x i8> %op2 to <128 x i16>
    %mul = mul <128 x i16> %1, %2
-  %shr = lshr <128 x i16> %mul, %splat
+  %shr = lshr <128 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    %res = trunc <128 x i16> %shr to <128 x i8>
    store <128 x i8> %res, <128 x i8>* %a
    ret void
@@ -194,12 +156,10 @@ define void @smulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
  ; VBITS_GE_2048-NEXT:    ret
    %op1 = load <256 x i8>, <256 x i8>* %a
    %op2 = load <256 x i8>, <256 x i8>* %b
-  %insert = insertelement <256 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <256 x i16> %insert, <256 x i16> undef, <256 x i32> zeroinitializer
    %1 = sext <256 x i8> %op1 to <256 x i16>
    %2 = sext <256 x i8> %op2 to <256 x i16>
    %mul = mul <256 x i16> %1, %2
-  %shr = lshr <256 x i16> %mul, %splat
+  %shr = lshr <256 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    %res = trunc <256 x i16> %shr to <256 x i8>
    store <256 x i8> %res, <256 x i8>* %a
    ret void
@@ -220,12 +180,10 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
  ; CHECK-NEXT:    mov v0.h[3], w8
  ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
  ; CHECK-NEXT:    ret
-  %insert = insertelement <4 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
    %1 = sext <4 x i16> %op1 to <4 x i32>
    %2 = sext <4 x i16> %op2 to <4 x i32>
    %mul = mul <4 x i32> %1, %2
-  %shr = lshr <4 x i32> %mul, %splat
+  %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
    %res = trunc <4 x i32> %shr to <4 x i16>
    ret <4 x i16> %res
  }
@@ -238,110 +196,69 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
  ; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
  ; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
  ; CHECK-NEXT:    ret
-  %insert = insertelement <8 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
    %1 = sext <8 x i16> %op1 to <8 x i32>
    %2 = sext <8 x i16> %op2 to <8 x i32>
    %mul = mul <8 x i32> %1, %2
-  %shr = lshr <8 x i32> %mul, %splat
+  %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %res = trunc <8 x i32> %shr to <8 x i16>
    ret <8 x i16> %res
  }
  
  define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; VBITS_EQ_256-LABEL: smulh_v16i16:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
-; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
-; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: smulh_v16i16:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    ld1sh { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1sh { z1.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_512-NEXT:    lsr z0.s, p0/m, z0.s, #16
-; VBITS_GE_512-NEXT:    st1h { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
-
+; VBITS_GE_256-LABEL: smulh_v16i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
    %op1 = load <16 x i16>, <16 x i16>* %a
    %op2 = load <16 x i16>, <16 x i16>* %b
-  %insert = insertelement <16 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
    %1 = sext <16 x i16> %op1 to <16 x i32>
    %2 = sext <16 x i16> %op2 to <16 x i32>
    %mul = mul <16 x i32> %1, %2
-  %shr = lshr <16 x i32> %mul, %splat
+  %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %res = trunc <16 x i32> %shr to <16 x i16>
    store <16 x i16> %res, <16 x i16>* %a
    ret void
  }
  
  define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; VBITS_EQ_512-LABEL: smulh_v32i16:
-; VBITS_EQ_512:       // %bb.0:
-; VBITS_EQ_512-NEXT:    ptrue p0.h, vl32
-; VBITS_EQ_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_EQ_512-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
-; VBITS_EQ_512-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_EQ_512-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: smulh_v32i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1sh { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1sh { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_1024-NEXT:    lsr z0.s, p0/m, z0.s, #16
-; VBITS_GE_1024-NEXT:    st1h { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
-
+; VBITS_GE_512-LABEL: smulh_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
    %op1 = load <32 x i16>, <32 x i16>* %a
    %op2 = load <32 x i16>, <32 x i16>* %b
-  %insert = insertelement <32 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer
    %1 = sext <32 x i16> %op1 to <32 x i32>
    %2 = sext <32 x i16> %op2 to <32 x i32>
    %mul = mul <32 x i32> %1, %2
-  %shr = lshr <32 x i32> %mul, %splat
+  %shr = lshr <32 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %res = trunc <32 x i32> %shr to <32 x i16>
    store <32 x i16> %res, <32 x i16>* %a
    ret void
  }
  
  define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
-; VBITS_EQ_1024-LABEL: smulh_v64i16:
-; VBITS_EQ_1024:       // %bb.0:
-; VBITS_EQ_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_EQ_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_EQ_1024-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
-; VBITS_EQ_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_EQ_1024-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: smulh_v64i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1sh { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1sh { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_2048-NEXT:    lsr z0.s, p0/m, z0.s, #16
-; VBITS_GE_2048-NEXT:    st1h { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
-
+; VBITS_GE_1024-LABEL: smulh_v64i16:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
+; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_1024-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
    %op1 = load <64 x i16>, <64 x i16>* %a
    %op2 = load <64 x i16>, <64 x i16>* %b
-  %insert = insertelement <64 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer
    %1 = sext <64 x i16> %op1 to <64 x i32>
    %2 = sext <64 x i16> %op2 to <64 x i32>
    %mul = mul <64 x i32> %1, %2
-  %shr = lshr <64 x i32> %mul, %splat
+  %shr = lshr <64 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %res = trunc <64 x i32> %shr to <64 x i16>
    store <64 x i16> %res, <64 x i16>* %a
    ret void
@@ -358,12 +275,10 @@ define void @smulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
  ; VBITS_GE_2048-NEXT:    ret
    %op1 = load <128 x i16>, <128 x i16>* %a
    %op2 = load <128 x i16>, <128 x i16>* %b
-  %insert = insertelement <128 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <128 x i32> %insert, <128 x i32> undef, <128 x i32> zeroinitializer
    %1 = sext <128 x i16> %op1 to <128 x i32>
    %2 = sext <128 x i16> %op2 to <128 x i32>
    %mul = mul <128 x i32> %1, %2
-  %shr = lshr <128 x i32> %mul, %splat
+  %shr = lshr <128 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %res = trunc <128 x i32> %shr to <128 x i16>
    store <128 x i16> %res, <128 x i16>* %a
    ret void
@@ -379,12 +294,10 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
  ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
  ; CHECK-NEXT:    shrn v0.2s, v0.2d, #32
  ; CHECK-NEXT:    ret
-  %insert = insertelement <2 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
    %1 = sext <2 x i32> %op1 to <2 x i64>
    %2 = sext <2 x i32> %op2 to <2 x i64>
    %mul = mul <2 x i64> %1, %2
-  %shr = lshr <2 x i64> %mul, %splat
+  %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
    %res = trunc <2 x i64> %shr to <2 x i32>
    ret <2 x i32> %res
  }
@@ -397,110 +310,69 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
  ; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
  ; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
  ; CHECK-NEXT:    ret
-  %insert = insertelement <4 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
    %1 = sext <4 x i32> %op1 to <4 x i64>
    %2 = sext <4 x i32> %op2 to <4 x i64>
    %mul = mul <4 x i64> %1, %2
-  %shr = lshr <4 x i64> %mul, %splat
+  %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
    %res = trunc <4 x i64> %shr to <4 x i32>
    ret <4 x i32> %res
  }
  
  define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
-; VBITS_EQ_256-LABEL: smulh_v8i32:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: smulh_v8i32:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1sw { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_512-NEXT:    lsr z0.d, p0/m, z0.d, #32
-; VBITS_GE_512-NEXT:    st1w { z0.d }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
-
+; VBITS_GE_256-LABEL: smulh_v8i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
    %op1 = load <8 x i32>, <8 x i32>* %a
    %op2 = load <8 x i32>, <8 x i32>* %b
-  %insert = insertelement <8 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
    %1 = sext <8 x i32> %op1 to <8 x i64>
    %2 = sext <8 x i32> %op2 to <8 x i64>
    %mul = mul <8 x i64> %1, %2
-  %shr = lshr <8 x i64> %mul, %splat
+  %shr = lshr <8 x i64> %mul,  <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
    %res = trunc <8 x i64> %shr to <8 x i32>
    store <8 x i32> %res, <8 x i32>* %a
    ret void
  }
  
  define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; VBITS_EQ_512-LABEL: smulh_v16i32:
-; VBITS_EQ_512:       // %bb.0:
-; VBITS_EQ_512-NEXT:    ptrue p0.s, vl16
-; VBITS_EQ_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_EQ_512-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_512-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_EQ_512-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: smulh_v16i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1sw { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1sw { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_1024-NEXT:    lsr z0.d, p0/m, z0.d, #32
-; VBITS_GE_1024-NEXT:    st1w { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
-
+; VBITS_GE_512-LABEL: smulh_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
    %op1 = load <16 x i32>, <16 x i32>* %a
    %op2 = load <16 x i32>, <16 x i32>* %b
-  %insert = insertelement <16 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer
    %1 = sext <16 x i32> %op1 to <16 x i64>
    %2 = sext <16 x i32> %op2 to <16 x i64>
    %mul = mul <16 x i64> %1, %2
-  %shr = lshr <16 x i64> %mul, %splat
+  %shr = lshr <16 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
    %res = trunc <16 x i64> %shr to <16 x i32>
    store <16 x i32> %res, <16 x i32>* %a
    ret void
  }
  
  define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
-; VBITS_EQ_1024-LABEL: smulh_v32i32:
-; VBITS_EQ_1024:       // %bb.0:
-; VBITS_EQ_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_EQ_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_EQ_1024-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_EQ_1024-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: smulh_v32i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1sw { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1sw { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_2048-NEXT:    lsr z0.d, p0/m, z0.d, #32
-; VBITS_GE_2048-NEXT:    st1w { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
-
+; VBITS_GE_1024-LABEL: smulh_v32i32:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_1024-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
    %op1 = load <32 x i32>, <32 x i32>* %a
    %op2 = load <32 x i32>, <32 x i32>* %b
-  %insert = insertelement <32 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer
    %1 = sext <32 x i32> %op1 to <32 x i64>
    %2 = sext <32 x i32> %op2 to <32 x i64>
    %mul = mul <32 x i64> %1, %2
-  %shr = lshr <32 x i64> %mul, %splat
+  %shr = lshr <32 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
    %res = trunc <32 x i64> %shr to <32 x i32>
    store <32 x i32> %res, <32 x i32>* %a
    ret void
@@ -517,12 +389,10 @@ define void @smulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
  ; VBITS_GE_2048-NEXT:    ret
    %op1 = load <64 x i32>, <64 x i32>* %a
    %op2 = load <64 x i32>, <64 x i32>* %b
-  %insert = insertelement <64 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <64 x i64> %insert, <64 x i64> undef, <64 x i32> zeroinitializer
    %1 = sext <64 x i32> %op1 to <64 x i64>
    %2 = sext <64 x i32> %op2 to <64 x i64>
    %mul = mul <64 x i64> %1, %2
-  %shr = lshr <64 x i64> %mul, %splat
+  %shr = lshr <64 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
    %res = trunc <64 x i64> %shr to <64 x i32>
    store <64 x i32> %res, <64 x i32>* %a
    ret void
@@ -558,12 +428,10 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
  ; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
  ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
  ; CHECK-NEXT:    ret
-  %insert = insertelement <2 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <2 x i128> %insert, <2 x i128> undef, <2 x i32> zeroinitializer
    %1 = sext <2 x i64> %op1 to <2 x i128>
    %2 = sext <2 x i64> %op2 to <2 x i128>
    %mul = mul <2 x i128> %1, %2
-  %shr = lshr <2 x i128> %mul, %splat
+  %shr = lshr <2 x i128> %mul, <i128 64, i128 64>
    %res = trunc <2 x i128> %shr to <2 x i64>
    ret <2 x i64> %res
  }
@@ -579,12 +447,10 @@ define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
  ; CHECK-NEXT:    ret
    %op1 = load <4 x i64>, <4 x i64>* %a
    %op2 = load <4 x i64>, <4 x i64>* %b
-  %insert = insertelement <4 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <4 x i128> %insert, <4 x i128> undef, <4 x i32> zeroinitializer
    %1 = sext <4 x i64> %op1 to <4 x i128>
    %2 = sext <4 x i64> %op2 to <4 x i128>
    %mul = mul <4 x i128> %1, %2
-  %shr = lshr <4 x i128> %mul, %splat
+  %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
    %res = trunc <4 x i128> %shr to <4 x i64>
    store <4 x i64> %res, <4 x i64>* %a
    ret void
@@ -601,12 +467,10 @@ define void @smulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
  ; VBITS_GE_512-NEXT:    ret
    %op1 = load <8 x i64>, <8 x i64>* %a
    %op2 = load <8 x i64>, <8 x i64>* %b
-  %insert = insertelement <8 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <8 x i128> %insert, <8 x i128> undef, <8 x i32> zeroinitializer
    %1 = sext <8 x i64> %op1 to <8 x i128>
    %2 = sext <8 x i64> %op2 to <8 x i128>
    %mul = mul <8 x i128> %1, %2
-  %shr = lshr <8 x i128> %mul, %splat
+  %shr = lshr <8 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
    %res = trunc <8 x i128> %shr to <8 x i64>
    store <8 x i64> %res, <8 x i64>* %a
    ret void
@@ -623,12 +487,10 @@ define void @smulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
  ; VBITS_GE_1024-NEXT:    ret
    %op1 = load <16 x i64>, <16 x i64>* %a
    %op2 = load <16 x i64>, <16 x i64>* %b
-  %insert = insertelement <16 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <16 x i128> %insert, <16 x i128> undef, <16 x i32> zeroinitializer
    %1 = sext <16 x i64> %op1 to <16 x i128>
    %2 = sext <16 x i64> %op2 to <16 x i128>
    %mul = mul <16 x i128> %1, %2
-  %shr = lshr <16 x i128> %mul, %splat
+  %shr = lshr <16 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
    %res = trunc <16 x i128> %shr to <16 x i64>
    store <16 x i64> %res, <16 x i64>* %a
    ret void
@@ -645,12 +507,10 @@ define void @smulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
  ; VBITS_GE_2048-NEXT:    ret
    %op1 = load <32 x i64>, <32 x i64>* %a
    %op2 = load <32 x i64>, <32 x i64>* %b
-  %insert = insertelement <32 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <32 x i128> %insert, <32 x i128> undef, <32 x i32> zeroinitializer
    %1 = sext <32 x i64> %op1 to <32 x i128>
    %2 = sext <32 x i64> %op2 to <32 x i128>
    %mul = mul <32 x i128> %1, %2
-  %shr = lshr <32 x i128> %mul, %splat
+  %shr = lshr <32 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
    %res = trunc <32 x i128> %shr to <32 x i64>
    store <32 x i64> %res, <32 x i64>* %a
    ret void
@@ -684,12 +544,10 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
  ; CHECK-NEXT:    mov v0.b[7], w8
  ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
  ; CHECK-NEXT:    ret
-  %insert = insertelement <8 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
    %1 = zext <8 x i8> %op1 to <8 x i16>
    %2 = zext <8 x i8> %op2 to <8 x i16>
    %mul = mul <8 x i16> %1, %2
-  %shr = lshr <8 x i16> %mul, %splat
+  %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    %res = trunc <8 x i16> %shr to <8 x i8>
    ret <8 x i8> %res
  }
@@ -702,101 +560,63 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
  ; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
  ; CHECK-NEXT:    uzp2 v0.16b, v0.16b, v2.16b
  ; CHECK-NEXT:    ret
-  %insert = insertelement <16 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
    %1 = zext <16 x i8> %op1 to <16 x i16>
    %2 = zext <16 x i8> %op2 to <16 x i16>
    %mul = mul <16 x i16> %1, %2
-  %shr = lshr <16 x i16> %mul, %splat
+  %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    %res = trunc <16 x i16> %shr to <16 x i8>
    ret <16 x i8> %res
  }
  
  define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
-; VBITS_EQ_256-LABEL: umulh_v32i8:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ptrue p0.b, vl32
-; VBITS_EQ_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_EQ_256-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: umulh_v32i8:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    ld1b { z0.h }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1b { z1.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    mul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_512-NEXT:    lsr z0.h, p0/m, z0.h, #8
-; VBITS_GE_512-NEXT:    st1b { z0.h }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
-
+; VBITS_GE_256-LABEL: umulh_v32i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
    %op1 = load <32 x i8>, <32 x i8>* %a
    %op2 = load <32 x i8>, <32 x i8>* %b
-  %insert = insertelement <32 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
    %1 = zext <32 x i8> %op1 to <32 x i16>
    %2 = zext <32 x i8> %op2 to <32 x i16>
    %mul = mul <32 x i16> %1, %2
-  %shr = lshr <32 x i16> %mul, %splat
+  %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    %res = trunc <32 x i16> %shr to <32 x i8>
    store <32 x i8> %res, <32 x i8>* %a
    ret void
  }
  
  define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; VBITS_EQ_512-LABEL: umulh_v64i8:
-; VBITS_EQ_512:       // %bb.0:
-; VBITS_EQ_512-NEXT:    ptrue p0.b, vl64
-; VBITS_EQ_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_EQ_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_EQ_512-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_EQ_512-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_EQ_512-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: umulh_v64i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1b { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1b { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    mul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_1024-NEXT:    lsr z0.h, p0/m, z0.h, #8
-; VBITS_GE_1024-NEXT:    st1b { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
-
+; VBITS_GE_512-LABEL: umulh_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
    %op1 = load <64 x i8>, <64 x i8>* %a
    %op2 = load <64 x i8>, <64 x i8>* %b
-  %insert = insertelement <64 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
    %1 = zext <64 x i8> %op1 to <64 x i16>
    %2 = zext <64 x i8> %op2 to <64 x i16>
    %mul = mul <64 x i16> %1, %2
-  %shr = lshr <64 x i16> %mul, %splat
+  %shr = lshr <64 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    %res = trunc <64 x i16> %shr to <64 x i8>
    store <64 x i8> %res, <64 x i8>* %a
    ret void
  }
  
  define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
-; VBITS_EQ_1024-LABEL: umulh_v128i8:
-; VBITS_EQ_1024:       // %bb.0:
-; VBITS_EQ_1024-NEXT:    ptrue p0.b, vl128
-; VBITS_EQ_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_EQ_1024-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_EQ_1024-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_EQ_1024-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_EQ_1024-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: umulh_v128i8:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1b { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1b { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    mul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_2048-NEXT:    lsr z0.h, p0/m, z0.h, #8
-; VBITS_GE_2048-NEXT:    st1b { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+; VBITS_GE_1024-LABEL: umulh_v128i8:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.b, vl128
+; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_1024-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_1024-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
  
    %op1 = load <128 x i8>, <128 x i8>* %a
    %op2 = load <128 x i8>, <128 x i8>* %b
@@ -805,7 +625,7 @@ define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
    %1 = zext <128 x i8> %op1 to <128 x i16>
    %2 = zext <128 x i8> %op2 to <128 x i16>
    %mul = mul <128 x i16> %1, %2
-  %shr = lshr <128 x i16> %mul, %splat
+  %shr = lshr <128 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    %res = trunc <128 x i16> %shr to <128 x i8>
    store <128 x i8> %res, <128 x i8>* %a
    ret void
@@ -822,12 +642,10 @@ define void @umulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
  ; VBITS_GE_2048-NEXT:    ret
    %op1 = load <256 x i8>, <256 x i8>* %a
    %op2 = load <256 x i8>, <256 x i8>* %b
-  %insert = insertelement <256 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <256 x i16> %insert, <256 x i16> undef, <256 x i32> zeroinitializer
    %1 = zext <256 x i8> %op1 to <256 x i16>
    %2 = zext <256 x i8> %op2 to <256 x i16>
    %mul = mul <256 x i16> %1, %2
-  %shr = lshr <256 x i16> %mul, %splat
+  %shr = lshr <256 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    %res = trunc <256 x i16> %shr to <256 x i8>
    store <256 x i8> %res, <256 x i8>* %a
    ret void
@@ -848,12 +666,10 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
  ; CHECK-NEXT:    mov v0.h[3], w8
  ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
  ; CHECK-NEXT:    ret
-  %insert = insertelement <4 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
    %1 = zext <4 x i16> %op1 to <4 x i32>
    %2 = zext <4 x i16> %op2 to <4 x i32>
    %mul = mul <4 x i32> %1, %2
-  %shr = lshr <4 x i32> %mul, %splat
+  %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
    %res = trunc <4 x i32> %shr to <4 x i16>
    ret <4 x i16> %res
  }
@@ -866,110 +682,69 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
  ; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
  ; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
  ; CHECK-NEXT:    ret
-  %insert = insertelement <8 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
    %1 = zext <8 x i16> %op1 to <8 x i32>
    %2 = zext <8 x i16> %op2 to <8 x i32>
    %mul = mul <8 x i32> %1, %2
-  %shr = lshr <8 x i32> %mul, %splat
+  %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %res = trunc <8 x i32> %shr to <8 x i16>
    ret <8 x i16> %res
  }
  
  define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; VBITS_EQ_256-LABEL: umulh_v16i16:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
-; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: umulh_v16i16:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1h { z1.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_512-NEXT:    lsr z0.s, p0/m, z0.s, #16
-; VBITS_GE_512-NEXT:    st1h { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
-
+; VBITS_GE_256-LABEL: umulh_v16i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
    %op1 = load <16 x i16>, <16 x i16>* %a
    %op2 = load <16 x i16>, <16 x i16>* %b
-  %insert = insertelement <16 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
    %1 = zext <16 x i16> %op1 to <16 x i32>
    %2 = zext <16 x i16> %op2 to <16 x i32>
    %mul = mul <16 x i32> %1, %2
-  %shr = lshr <16 x i32> %mul, %splat
+  %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %res = trunc <16 x i32> %shr to <16 x i16>
    store <16 x i16> %res, <16 x i16>* %a
    ret void
  }
  
  define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; VBITS_EQ_512-LABEL: umulh_v32i16:
-; VBITS_EQ_512:       // %bb.0:
-; VBITS_EQ_512-NEXT:    ptrue p0.h, vl32
-; VBITS_EQ_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_EQ_512-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_EQ_512-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_EQ_512-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: umulh_v32i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1h { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_1024-NEXT:    lsr z0.s, p0/m, z0.s, #16
-; VBITS_GE_1024-NEXT:    st1h { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
-
+; VBITS_GE_512-LABEL: umulh_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
    %op1 = load <32 x i16>, <32 x i16>* %a
    %op2 = load <32 x i16>, <32 x i16>* %b
-  %insert = insertelement <32 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer
    %1 = zext <32 x i16> %op1 to <32 x i32>
    %2 = zext <32 x i16> %op2 to <32 x i32>
    %mul = mul <32 x i32> %1, %2
-  %shr = lshr <32 x i32> %mul, %splat
+  %shr = lshr <32 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> ; shift each i32 product right by 16 so the trunc keeps its high half
    %res = trunc <32 x i32> %shr to <32 x i16>
    store <32 x i16> %res, <32 x i16>* %a
    ret void
  }
  
  define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
-; VBITS_EQ_1024-LABEL: umulh_v64i16:
-; VBITS_EQ_1024:       // %bb.0:
-; VBITS_EQ_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_EQ_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_EQ_1024-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_EQ_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_EQ_1024-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: umulh_v64i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1h { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_2048-NEXT:    lsr z0.s, p0/m, z0.s, #16
-; VBITS_GE_2048-NEXT:    st1h { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
-
+; VBITS_GE_1024-LABEL: umulh_v64i16:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
+; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_1024-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
    %op1 = load <64 x i16>, <64 x i16>* %a
    %op2 = load <64 x i16>, <64 x i16>* %b
-  %insert = insertelement <64 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer
    %1 = zext <64 x i16> %op1 to <64 x i32>
    %2 = zext <64 x i16> %op2 to <64 x i32>
    %mul = mul <64 x i32> %1, %2
-  %shr = lshr <64 x i32> %mul, %splat
+  %shr = lshr <64 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %res = trunc <64 x i32> %shr to <64 x i16>
    store <64 x i16> %res, <64 x i16>* %a
    ret void
@@ -986,12 +761,10 @@ define void @umulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
  ; VBITS_GE_2048-NEXT:    ret
    %op1 = load <128 x i16>, <128 x i16>* %a
    %op2 = load <128 x i16>, <128 x i16>* %b
-  %insert = insertelement <128 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <128 x i32> %insert, <128 x i32> undef, <128 x i32> zeroinitializer
    %1 = zext <128 x i16> %op1 to <128 x i32>
    %2 = zext <128 x i16> %op2 to <128 x i32>
    %mul = mul <128 x i32> %1, %2
-  %shr = lshr <128 x i32> %mul, %splat
+  %shr = lshr <128 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %res = trunc <128 x i32> %shr to <128 x i16>
    store <128 x i16> %res, <128 x i16>* %a
    ret void
@@ -1007,12 +780,10 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
  ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
  ; CHECK-NEXT:    shrn v0.2s, v0.2d, #32
  ; CHECK-NEXT:    ret
-  %insert = insertelement <2 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
    %1 = zext <2 x i32> %op1 to <2 x i64>
    %2 = zext <2 x i32> %op2 to <2 x i64>
    %mul = mul <2 x i64> %1, %2
-  %shr = lshr <2 x i64> %mul, %splat
+  %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
    %res = trunc <2 x i64> %shr to <2 x i32>
    ret <2 x i32> %res
  }
@@ -1025,36 +796,23 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
  ; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
  ; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
  ; CHECK-NEXT:    ret
-  %insert = insertelement <4 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
    %1 = zext <4 x i32> %op1 to <4 x i64>
    %2 = zext <4 x i32> %op2 to <4 x i64>
    %mul = mul <4 x i64> %1, %2
-  %shr = lshr <4 x i64> %mul, %splat
+  %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
    %res = trunc <4 x i64> %shr to <4 x i32>
    ret <4 x i32> %res
  }
  
  define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
-; VBITS_EQ_256-LABEL: umulh_v8i32:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: umulh_v8i32:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1w { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_512-NEXT:    lsr z0.d, p0/m, z0.d, #32
-; VBITS_GE_512-NEXT:    st1w { z0.d }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
-
+; VBITS_GE_256-LABEL: umulh_v8i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
    %op1 = load <8 x i32>, <8 x i32>* %a
    %op2 = load <8 x i32>, <8 x i32>* %b
    %insert = insertelement <8 x i64> undef, i64 32, i64 0
@@ -1062,73 +820,47 @@ define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
    %1 = zext <8 x i32> %op1 to <8 x i64>
    %2 = zext <8 x i32> %op2 to <8 x i64>
    %mul = mul <8 x i64> %1, %2
-  %shr = lshr <8 x i64> %mul, %splat
+  %shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
    %res = trunc <8 x i64> %shr to <8 x i32>
    store <8 x i32> %res, <8 x i32>* %a
    ret void
  }
  
  define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; VBITS_EQ_512-LABEL: umulh_v16i32:
-; VBITS_EQ_512:       // %bb.0:
-; VBITS_EQ_512-NEXT:    ptrue p0.s, vl16
-; VBITS_EQ_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_EQ_512-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_512-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_EQ_512-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: umulh_v16i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1w { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_1024-NEXT:    lsr z0.d, p0/m, z0.d, #32
-; VBITS_GE_1024-NEXT:    st1w { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
-
+; VBITS_GE_512-LABEL: umulh_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
    %op1 = load <16 x i32>, <16 x i32>* %a
    %op2 = load <16 x i32>, <16 x i32>* %b
-  %insert = insertelement <16 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer
    %1 = zext <16 x i32> %op1 to <16 x i64>
    %2 = zext <16 x i32> %op2 to <16 x i64>
    %mul = mul <16 x i64> %1, %2
-  %shr = lshr <16 x i64> %mul, %splat
+  %shr = lshr <16 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> ; shift each i64 product right by 32 so the trunc keeps its high half
    %res = trunc <16 x i64> %shr to <16 x i32>
    store <16 x i32> %res, <16 x i32>* %a
    ret void
  }
  
  define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
-; VBITS_EQ_1024-LABEL: umulh_v32i32:
-; VBITS_EQ_1024:       // %bb.0:
-; VBITS_EQ_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_EQ_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_EQ_1024-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_EQ_1024-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: umulh_v32i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_2048-NEXT:    lsr z0.d, p0/m, z0.d, #32
-; VBITS_GE_2048-NEXT:    st1w { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
-
+; VBITS_GE_1024-LABEL: umulh_v32i32:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_1024-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
    %op1 = load <32 x i32>, <32 x i32>* %a
    %op2 = load <32 x i32>, <32 x i32>* %b
-  %insert = insertelement <32 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer
    %1 = zext <32 x i32> %op1 to <32 x i64>
    %2 = zext <32 x i32> %op2 to <32 x i64>
    %mul = mul <32 x i64> %1, %2
-  %shr = lshr <32 x i64> %mul, %splat
+  %shr = lshr <32 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
    %res = trunc <32 x i64> %shr to <32 x i32>
    store <32 x i32> %res, <32 x i32>* %a
    ret void
@@ -1145,12 +877,10 @@ define void @umulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
  ; VBITS_GE_2048-NEXT:    ret
    %op1 = load <64 x i32>, <64 x i32>* %a
    %op2 = load <64 x i32>, <64 x i32>* %b
-  %insert = insertelement <64 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <64 x i64> %insert, <64 x i64> undef, <64 x i32> zeroinitializer
    %1 = zext <64 x i32> %op1 to <64 x i64>
    %2 = zext <64 x i32> %op2 to <64 x i64>
    %mul = mul <64 x i64> %1, %2
-  %shr = lshr <64 x i64> %mul, %splat
+  %shr = lshr <64 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
    %res = trunc <64 x i64> %shr to <64 x i32>
    store <64 x i32> %res, <64 x i32>* %a
    ret void
@@ -1166,12 +896,10 @@ define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
  ; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
  ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
  ; CHECK-NEXT:    ret
-  %insert = insertelement <1 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
    %1 = zext <1 x i64> %op1 to <1 x i128>
    %2 = zext <1 x i64> %op2 to <1 x i128>
    %mul = mul <1 x i128> %1, %2
-  %shr = lshr <1 x i128> %mul, %splat
+  %shr = lshr <1 x i128> %mul, <i128 64>
    %res = trunc <1 x i128> %shr to <1 x i64>
    ret <1 x i64> %res
  }
@@ -1186,12 +914,10 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
  ; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
  ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
  ; CHECK-NEXT:    ret
-  %insert = insertelement <2 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <2 x i128> %insert, <2 x i128> undef, <2 x i32> zeroinitializer
    %1 = zext <2 x i64> %op1 to <2 x i128>
    %2 = zext <2 x i64> %op2 to <2 x i128>
    %mul = mul <2 x i128> %1, %2
-  %shr = lshr <2 x i128> %mul, %splat
+  %shr = lshr <2 x i128> %mul, <i128 64, i128 64>
    %res = trunc <2 x i128> %shr to <2 x i64>
    ret <2 x i64> %res
  }
@@ -1207,12 +933,10 @@ define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
  ; CHECK-NEXT:    ret
    %op1 = load <4 x i64>, <4 x i64>* %a
    %op2 = load <4 x i64>, <4 x i64>* %b
-  %insert = insertelement <4 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <4 x i128> %insert, <4 x i128> undef, <4 x i32> zeroinitializer
    %1 = zext <4 x i64> %op1 to <4 x i128>
    %2 = zext <4 x i64> %op2 to <4 x i128>
    %mul = mul <4 x i128> %1, %2
-  %shr = lshr <4 x i128> %mul, %splat
+  %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
    %res = trunc <4 x i128> %shr to <4 x i64>
    store <4 x i64> %res, <4 x i64>* %a
    ret void
@@ -1229,12 +953,10 @@ define void @umulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
  ; VBITS_GE_512-NEXT:    ret
    %op1 = load <8 x i64>, <8 x i64>* %a
    %op2 = load <8 x i64>, <8 x i64>* %b
-  %insert = insertelement <8 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <8 x i128> %insert, <8 x i128> undef, <8 x i32> zeroinitializer
    %1 = zext <8 x i64> %op1 to <8 x i128>
    %2 = zext <8 x i64> %op2 to <8 x i128>
    %mul = mul <8 x i128> %1, %2
-  %shr = lshr <8 x i128> %mul, %splat
+  %shr = lshr <8 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
    %res = trunc <8 x i128> %shr to <8 x i64>
    store <8 x i64> %res, <8 x i64>* %a
    ret void
@@ -1251,12 +973,10 @@ define void @umulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
  ; VBITS_GE_1024-NEXT:    ret
    %op1 = load <16 x i64>, <16 x i64>* %a
    %op2 = load <16 x i64>, <16 x i64>* %b
-  %insert = insertelement <16 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <16 x i128> %insert, <16 x i128> undef, <16 x i32> zeroinitializer
    %1 = zext <16 x i64> %op1 to <16 x i128>
    %2 = zext <16 x i64> %op2 to <16 x i128>
    %mul = mul <16 x i128> %1, %2
-  %shr = lshr <16 x i128> %mul, %splat
+  %shr = lshr <16 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
    %res = trunc <16 x i128> %shr to <16 x i64>
    store <16 x i64> %res, <16 x i64>* %a
    ret void
@@ -1273,12 +993,10 @@ define void @umulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
  ; VBITS_GE_2048-NEXT:    ret
    %op1 = load <32 x i64>, <32 x i64>* %a
    %op2 = load <32 x i64>, <32 x i64>* %b
-  %insert = insertelement <32 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <32 x i128> %insert, <32 x i128> undef, <32 x i32> zeroinitializer
    %1 = zext <32 x i64> %op1 to <32 x i128>
    %2 = zext <32 x i64> %op2 to <32 x i128>
    %mul = mul <32 x i128> %1, %2
-  %shr = lshr <32 x i128> %mul, %splat
+  %shr = lshr <32 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
    %res = trunc <32 x i128> %shr to <32 x i64>
    store <32 x i64> %res, <32 x i64>* %a
    ret void
author	David Truby <david.truby@arm.com>
	Fri, 14 Jan 2022 14:20:44 +0000 (14:20 +0000)
committer	David Truby <david.truby@arm.com>
	Fri, 14 Jan 2022 14:21:36 +0000 (14:21 +0000)