; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
%1 = sext <8 x i8> %op1 to <8 x i16>
%2 = sext <8 x i8> %op2 to <8 x i16>
%mul = mul <8 x i16> %1, %2
- %shr = lshr <8 x i16> %mul, %splat
+ %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <8 x i16> %shr to <8 x i8>
ret <8 x i8> %res
}
; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: uzp2 v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
- %insert = insertelement <16 x i16> undef, i16 8, i64 0
- %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
%1 = sext <16 x i8> %op1 to <16 x i16>
%2 = sext <16 x i8> %op2 to <16 x i16>
%mul = mul <16 x i16> %1, %2
- %shr = lshr <16 x i16> %mul, %splat
+ %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <16 x i16> %shr to <16 x i8>
ret <16 x i8> %res
}
define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; VBITS_GE_256-LABEL: smulh_v32i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: smulh z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
  %op1 = load <32 x i8>, <32 x i8>* %a
  %op2 = load <32 x i8>, <32 x i8>* %b
  %1 = sext <32 x i8> %op1 to <32 x i16>
  %2 = sext <32 x i8> %op2 to <32 x i16>
  %mul = mul <32 x i16> %1, %2
  %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <32 x i16> %shr to <32 x i8>
  store <32 x i8> %res, <32 x i8>* %a
  ret void
}
define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; VBITS_GE_512-LABEL: smulh_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT: smulh z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <64 x i8>, <64 x i8>* %a
  %op2 = load <64 x i8>, <64 x i8>* %b
  %1 = sext <64 x i8> %op1 to <64 x i16>
  %2 = sext <64 x i8> %op2 to <64 x i16>
  %mul = mul <64 x i16> %1, %2
  %shr = lshr <64 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <64 x i16> %shr to <64 x i8>
  store <64 x i8> %res, <64 x i8>* %a
  ret void
}
define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
; VBITS_GE_1024-LABEL: smulh_v128i8:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_1024-NEXT: smulh z0.b, p0/m, z0.b, z1.b
; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <128 x i8>, <128 x i8>* %a
  %op2 = load <128 x i8>, <128 x i8>* %b
  %1 = sext <128 x i8> %op1 to <128 x i16>
  %2 = sext <128 x i8> %op2 to <128 x i16>
  %mul = mul <128 x i16> %1, %2
  %shr = lshr <128 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <128 x i16> %shr to <128 x i8>
  store <128 x i8> %res, <128 x i8>* %a
  ret void
}
; VBITS_GE_2048-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
- %insert = insertelement <256 x i16> undef, i16 8, i64 0
- %splat = shufflevector <256 x i16> %insert, <256 x i16> undef, <256 x i32> zeroinitializer
%1 = sext <256 x i8> %op1 to <256 x i16>
%2 = sext <256 x i8> %op2 to <256 x i16>
%mul = mul <256 x i16> %1, %2
- %shr = lshr <256 x i16> %mul, %splat
+ %shr = lshr <256 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <256 x i16> %shr to <256 x i8>
store <256 x i8> %res, <256 x i8>* %a
ret void
; CHECK-NEXT: mov v0.h[3], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
- %insert = insertelement <4 x i32> undef, i32 16, i64 0
- %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = sext <4 x i16> %op1 to <4 x i32>
%2 = sext <4 x i16> %op2 to <4 x i32>
%mul = mul <4 x i32> %1, %2
- %shr = lshr <4 x i32> %mul, %splat
+ %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
%res = trunc <4 x i32> %shr to <4 x i16>
ret <4 x i16> %res
}
; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
; CHECK-NEXT: ret
- %insert = insertelement <8 x i32> undef, i32 16, i64 0
- %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
%1 = sext <8 x i16> %op1 to <8 x i32>
%2 = sext <8 x i16> %op2 to <8 x i32>
%mul = mul <8 x i32> %1, %2
- %shr = lshr <8 x i32> %mul, %splat
+ %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%res = trunc <8 x i32> %shr to <8 x i16>
ret <8 x i16> %res
}
define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; VBITS_GE_256-LABEL: smulh_v16i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: smulh z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
  %op1 = load <16 x i16>, <16 x i16>* %a
  %op2 = load <16 x i16>, <16 x i16>* %b
  %1 = sext <16 x i16> %op1 to <16 x i32>
  %2 = sext <16 x i16> %op2 to <16 x i32>
  %mul = mul <16 x i32> %1, %2
  %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <16 x i32> %shr to <16 x i16>
  store <16 x i16> %res, <16 x i16>* %a
  ret void
}
define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; VBITS_GE_512-LABEL: smulh_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: smulh z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <32 x i16>, <32 x i16>* %a
  %op2 = load <32 x i16>, <32 x i16>* %b
  %1 = sext <32 x i16> %op1 to <32 x i32>
  %2 = sext <32 x i16> %op2 to <32 x i32>
  %mul = mul <32 x i32> %1, %2
  %shr = lshr <32 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <32 x i32> %shr to <32 x i16>
  store <32 x i16> %res, <32 x i16>* %a
  ret void
}
define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
; VBITS_GE_1024-LABEL: smulh_v64i16:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_1024-NEXT: smulh z0.h, p0/m, z0.h, z1.h
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <64 x i16>, <64 x i16>* %a
  %op2 = load <64 x i16>, <64 x i16>* %b
  %1 = sext <64 x i16> %op1 to <64 x i32>
  %2 = sext <64 x i16> %op2 to <64 x i32>
  %mul = mul <64 x i32> %1, %2
  %shr = lshr <64 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <64 x i32> %shr to <64 x i16>
  store <64 x i16> %res, <64 x i16>* %a
  ret void
}
; VBITS_GE_2048-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
- %insert = insertelement <128 x i32> undef, i32 16, i64 0
- %splat = shufflevector <128 x i32> %insert, <128 x i32> undef, <128 x i32> zeroinitializer
%1 = sext <128 x i16> %op1 to <128 x i32>
%2 = sext <128 x i16> %op2 to <128 x i32>
%mul = mul <128 x i32> %1, %2
- %shr = lshr <128 x i32> %mul, %splat
+ %shr = lshr <128 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%res = trunc <128 x i32> %shr to <128 x i16>
store <128 x i16> %res, <128 x i16>* %a
ret void
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: shrn v0.2s, v0.2d, #32
; CHECK-NEXT: ret
- %insert = insertelement <2 x i64> undef, i64 32, i64 0
- %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
%1 = sext <2 x i32> %op1 to <2 x i64>
%2 = sext <2 x i32> %op2 to <2 x i64>
%mul = mul <2 x i64> %1, %2
- %shr = lshr <2 x i64> %mul, %splat
+ %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
%res = trunc <2 x i64> %shr to <2 x i32>
ret <2 x i32> %res
}
; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s
; CHECK-NEXT: ret
- %insert = insertelement <4 x i64> undef, i64 32, i64 0
- %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
%1 = sext <4 x i32> %op1 to <4 x i64>
%2 = sext <4 x i32> %op2 to <4 x i64>
%mul = mul <4 x i64> %1, %2
- %shr = lshr <4 x i64> %mul, %splat
+ %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
%res = trunc <4 x i64> %shr to <4 x i32>
ret <4 x i32> %res
}
define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; VBITS_GE_256-LABEL: smulh_v8i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: smulh z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
  %op1 = load <8 x i32>, <8 x i32>* %a
  %op2 = load <8 x i32>, <8 x i32>* %b
  %1 = sext <8 x i32> %op1 to <8 x i64>
  %2 = sext <8 x i32> %op2 to <8 x i64>
  %mul = mul <8 x i64> %1, %2
  %shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <8 x i64> %shr to <8 x i32>
  store <8 x i32> %res, <8 x i32>* %a
  ret void
}
define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; VBITS_GE_512-LABEL: smulh_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: smulh z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <16 x i32>, <16 x i32>* %a
  %op2 = load <16 x i32>, <16 x i32>* %b
  %1 = sext <16 x i32> %op1 to <16 x i64>
  %2 = sext <16 x i32> %op2 to <16 x i64>
  %mul = mul <16 x i64> %1, %2
  %shr = lshr <16 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <16 x i64> %shr to <16 x i32>
  store <16 x i32> %res, <16 x i32>* %a
  ret void
}
define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
; VBITS_GE_1024-LABEL: smulh_v32i32:
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_1024-NEXT: smulh z0.s, p0/m, z0.s, z1.s
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <32 x i32>, <32 x i32>* %a
  %op2 = load <32 x i32>, <32 x i32>* %b
  %1 = sext <32 x i32> %op1 to <32 x i64>
  %2 = sext <32 x i32> %op2 to <32 x i64>
  %mul = mul <32 x i64> %1, %2
  %shr = lshr <32 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <32 x i64> %shr to <32 x i32>
  store <32 x i32> %res, <32 x i32>* %a
  ret void
}
; VBITS_GE_2048-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
- %insert = insertelement <64 x i64> undef, i64 32, i64 0
- %splat = shufflevector <64 x i64> %insert, <64 x i64> undef, <64 x i32> zeroinitializer
%1 = sext <64 x i32> %op1 to <64 x i64>
%2 = sext <64 x i32> %op2 to <64 x i64>
%mul = mul <64 x i64> %1, %2
- %shr = lshr <64 x i64> %mul, %splat
+ %shr = lshr <64 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%res = trunc <64 x i64> %shr to <64 x i32>
store <64 x i32> %res, <64 x i32>* %a
ret void
; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
- %insert = insertelement <2 x i128> undef, i128 64, i128 0
- %splat = shufflevector <2 x i128> %insert, <2 x i128> undef, <2 x i32> zeroinitializer
%1 = sext <2 x i64> %op1 to <2 x i128>
%2 = sext <2 x i64> %op2 to <2 x i128>
%mul = mul <2 x i128> %1, %2
- %shr = lshr <2 x i128> %mul, %splat
+ %shr = lshr <2 x i128> %mul, <i128 64, i128 64>
%res = trunc <2 x i128> %shr to <2 x i64>
ret <2 x i64> %res
}
; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
- %insert = insertelement <4 x i128> undef, i128 64, i128 0
- %splat = shufflevector <4 x i128> %insert, <4 x i128> undef, <4 x i32> zeroinitializer
%1 = sext <4 x i64> %op1 to <4 x i128>
%2 = sext <4 x i64> %op2 to <4 x i128>
%mul = mul <4 x i128> %1, %2
- %shr = lshr <4 x i128> %mul, %splat
+ %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
%res = trunc <4 x i128> %shr to <4 x i64>
store <4 x i64> %res, <4 x i64>* %a
ret void
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
- %insert = insertelement <8 x i128> undef, i128 64, i128 0
- %splat = shufflevector <8 x i128> %insert, <8 x i128> undef, <8 x i32> zeroinitializer
%1 = sext <8 x i64> %op1 to <8 x i128>
%2 = sext <8 x i64> %op2 to <8 x i128>
%mul = mul <8 x i128> %1, %2
- %shr = lshr <8 x i128> %mul, %splat
+ %shr = lshr <8 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
%res = trunc <8 x i128> %shr to <8 x i64>
store <8 x i64> %res, <8 x i64>* %a
ret void
; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
- %insert = insertelement <16 x i128> undef, i128 64, i128 0
- %splat = shufflevector <16 x i128> %insert, <16 x i128> undef, <16 x i32> zeroinitializer
%1 = sext <16 x i64> %op1 to <16 x i128>
%2 = sext <16 x i64> %op2 to <16 x i128>
%mul = mul <16 x i128> %1, %2
- %shr = lshr <16 x i128> %mul, %splat
+ %shr = lshr <16 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
%res = trunc <16 x i128> %shr to <16 x i64>
store <16 x i64> %res, <16 x i64>* %a
ret void
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
- %insert = insertelement <32 x i128> undef, i128 64, i128 0
- %splat = shufflevector <32 x i128> %insert, <32 x i128> undef, <32 x i32> zeroinitializer
%1 = sext <32 x i64> %op1 to <32 x i128>
%2 = sext <32 x i64> %op2 to <32 x i128>
%mul = mul <32 x i128> %1, %2
- %shr = lshr <32 x i128> %mul, %splat
+ %shr = lshr <32 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
%res = trunc <32 x i128> %shr to <32 x i64>
store <32 x i64> %res, <32 x i64>* %a
ret void
; CHECK-NEXT: mov v0.b[7], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
- %insert = insertelement <8 x i16> undef, i16 8, i64 0
- %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
%1 = zext <8 x i8> %op1 to <8 x i16>
%2 = zext <8 x i8> %op2 to <8 x i16>
%mul = mul <8 x i16> %1, %2
- %shr = lshr <8 x i16> %mul, %splat
+ %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <8 x i16> %shr to <8 x i8>
ret <8 x i8> %res
}
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: uzp2 v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
- %insert = insertelement <16 x i16> undef, i16 8, i64 0
- %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
%1 = zext <16 x i8> %op1 to <16 x i16>
%2 = zext <16 x i8> %op2 to <16 x i16>
%mul = mul <16 x i16> %1, %2
- %shr = lshr <16 x i16> %mul, %splat
+ %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <16 x i16> %shr to <16 x i8>
ret <16 x i8> %res
}
define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; VBITS_GE_256-LABEL: umulh_v32i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: umulh z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
  %op1 = load <32 x i8>, <32 x i8>* %a
  %op2 = load <32 x i8>, <32 x i8>* %b
  %1 = zext <32 x i8> %op1 to <32 x i16>
  %2 = zext <32 x i8> %op2 to <32 x i16>
  %mul = mul <32 x i16> %1, %2
  %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <32 x i16> %shr to <32 x i8>
  store <32 x i8> %res, <32 x i8>* %a
  ret void
}
define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; Prefix rework: VBITS_EQ_512 plus the VBITS_GE_1024 mul+lsr expectation
; merge into a single VBITS_GE_512 block selecting native umulh.
-; VBITS_EQ_512-LABEL: umulh_v64i8:
-; VBITS_EQ_512: // %bb.0:
-; VBITS_EQ_512-NEXT: ptrue p0.b, vl64
-; VBITS_EQ_512-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_EQ_512-NEXT: ld1b { z1.b }, p0/z, [x1]
-; VBITS_EQ_512-NEXT: umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_EQ_512-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_EQ_512-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: umulh_v64i8:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: ld1b { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1b { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: mul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_1024-NEXT: lsr z0.h, p0/m, z0.h, #8
-; VBITS_GE_1024-NEXT: st1b { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
-
+; VBITS_GE_512-LABEL: umulh_v64i8:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl64
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: umulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
; Constant splat shift amount replaces the insertelement/shufflevector idiom.
- %insert = insertelement <64 x i16> undef, i16 8, i64 0
- %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
%1 = zext <64 x i8> %op1 to <64 x i16>
%2 = zext <64 x i8> %op2 to <64 x i16>
; umulh pattern: trunc((zext(x) * zext(y)) >> 8)
%mul = mul <64 x i16> %1, %2
- %shr = lshr <64 x i16> %mul, %splat
+ %shr = lshr <64 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <64 x i16> %shr to <64 x i8>
store <64 x i8> %res, <64 x i8>* %a
ret void
}
define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
-; VBITS_EQ_1024-LABEL: umulh_v128i8:
-; VBITS_EQ_1024: // %bb.0:
-; VBITS_EQ_1024-NEXT: ptrue p0.b, vl128
-; VBITS_EQ_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1b { z1.b }, p0/z, [x1]
-; VBITS_EQ_1024-NEXT: umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_EQ_1024-NEXT: st1b { z0.b }, p0, [x0]
-; VBITS_EQ_1024-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: umulh_v128i8:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1b { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1b { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: mul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_2048-NEXT: lsr z0.h, p0/m, z0.h, #8
-; VBITS_GE_2048-NEXT: st1b { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
+; VBITS_GE_1024-LABEL: umulh_v128i8:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
+; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_1024-NEXT: umulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x0]
+; VBITS_GE_1024-NEXT: ret
%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b
%1 = zext <128 x i8> %op1 to <128 x i16>
%2 = zext <128 x i8> %op2 to <128 x i16>
%mul = mul <128 x i16> %1, %2
- %shr = lshr <128 x i16> %mul, %splat
+ %shr = lshr <128 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <128 x i16> %shr to <128 x i8>
store <128 x i8> %res, <128 x i8>* %a
ret void
; VBITS_GE_2048-NEXT: ret
%op1 = load <256 x i8>, <256 x i8>* %a
%op2 = load <256 x i8>, <256 x i8>* %b
- %insert = insertelement <256 x i16> undef, i16 8, i64 0
- %splat = shufflevector <256 x i16> %insert, <256 x i16> undef, <256 x i32> zeroinitializer
%1 = zext <256 x i8> %op1 to <256 x i16>
%2 = zext <256 x i8> %op2 to <256 x i16>
%mul = mul <256 x i16> %1, %2
- %shr = lshr <256 x i16> %mul, %splat
+ %shr = lshr <256 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <256 x i16> %shr to <256 x i8>
store <256 x i8> %res, <256 x i8>* %a
ret void
; CHECK-NEXT: mov v0.h[3], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
- %insert = insertelement <4 x i32> undef, i32 16, i64 0
- %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = zext <4 x i16> %op1 to <4 x i32>
%2 = zext <4 x i16> %op2 to <4 x i32>
%mul = mul <4 x i32> %1, %2
- %shr = lshr <4 x i32> %mul, %splat
+ %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
%res = trunc <4 x i32> %shr to <4 x i16>
ret <4 x i16> %res
}
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
; CHECK-NEXT: ret
- %insert = insertelement <8 x i32> undef, i32 16, i64 0
- %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
%1 = zext <8 x i16> %op1 to <8 x i32>
%2 = zext <8 x i16> %op2 to <8 x i32>
%mul = mul <8 x i32> %1, %2
- %shr = lshr <8 x i32> %mul, %splat
+ %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%res = trunc <8 x i32> %shr to <8 x i16>
ret <8 x i16> %res
}
define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; Prefix rework: VBITS_EQ_256 plus the widened mul+lsr expectation under
; VBITS_GE_512 merge into one VBITS_GE_256 block selecting native umulh.
-; VBITS_EQ_256-LABEL: umulh_v16i16:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
-; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_EQ_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: umulh_v16i16:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: ld1h { z1.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT: mul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_512-NEXT: lsr z0.s, p0/m, z0.s, #16
-; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT: ret
-
+; VBITS_GE_256-LABEL: umulh_v16i16:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.h, vl16
+; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT: umulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
; Constant splat shift amount replaces the insertelement/shufflevector idiom.
- %insert = insertelement <16 x i32> undef, i32 16, i64 0
- %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
%1 = zext <16 x i16> %op1 to <16 x i32>
%2 = zext <16 x i16> %op2 to <16 x i32>
; umulh pattern: trunc((zext(x) * zext(y)) >> 16)
%mul = mul <16 x i32> %1, %2
- %shr = lshr <16 x i32> %mul, %splat
+ %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%res = trunc <16 x i32> %shr to <16 x i16>
store <16 x i16> %res, <16 x i16>* %a
ret void
}
define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; Prefix rework: VBITS_EQ_512 plus the widened mul+lsr expectation under
; VBITS_GE_1024 merge into one VBITS_GE_512 block selecting native umulh.
-; VBITS_EQ_512-LABEL: umulh_v32i16:
-; VBITS_EQ_512: // %bb.0:
-; VBITS_EQ_512-NEXT: ptrue p0.h, vl32
-; VBITS_EQ_512-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_512-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_EQ_512-NEXT: umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_EQ_512-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_EQ_512-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: umulh_v32i16:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1h { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: mul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_1024-NEXT: lsr z0.s, p0/m, z0.s, #16
-; VBITS_GE_1024-NEXT: st1h { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
-
+; VBITS_GE_512-LABEL: umulh_v32i16:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: umulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
; Constant splat shift amount replaces the insertelement/shufflevector idiom.
- %insert = insertelement <32 x i32> undef, i32 16, i64 0
- %splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer
%1 = zext <32 x i16> %op1 to <32 x i32>
%2 = zext <32 x i16> %op2 to <32 x i32>
; umulh pattern: trunc((zext(x) * zext(y)) >> 16)
%mul = mul <32 x i32> %1, %2
- %shr = lshr <32 x i32> %mul, %splat
+ %shr = lshr <32 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%res = trunc <32 x i32> %shr to <32 x i16>
store <32 x i16> %res, <32 x i16>* %a
ret void
}
define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
-; VBITS_EQ_1024-LABEL: umulh_v64i16:
-; VBITS_EQ_1024: // %bb.0:
-; VBITS_EQ_1024-NEXT: ptrue p0.h, vl64
-; VBITS_EQ_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_EQ_1024-NEXT: umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_EQ_1024-NEXT: st1h { z0.h }, p0, [x0]
-; VBITS_EQ_1024-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: umulh_v64i16:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1h { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: mul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_2048-NEXT: lsr z0.s, p0/m, z0.s, #16
-; VBITS_GE_2048-NEXT: st1h { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
-
+; VBITS_GE_1024-LABEL: umulh_v64i16:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
+; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_1024-NEXT: umulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_1024-NEXT: ret
%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b
- %insert = insertelement <64 x i32> undef, i32 16, i64 0
- %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer
%1 = zext <64 x i16> %op1 to <64 x i32>
%2 = zext <64 x i16> %op2 to <64 x i32>
%mul = mul <64 x i32> %1, %2
- %shr = lshr <64 x i32> %mul, %splat
+ %shr = lshr <64 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%res = trunc <64 x i32> %shr to <64 x i16>
store <64 x i16> %res, <64 x i16>* %a
ret void
; VBITS_GE_2048-NEXT: ret
%op1 = load <128 x i16>, <128 x i16>* %a
%op2 = load <128 x i16>, <128 x i16>* %b
- %insert = insertelement <128 x i32> undef, i32 16, i64 0
- %splat = shufflevector <128 x i32> %insert, <128 x i32> undef, <128 x i32> zeroinitializer
%1 = zext <128 x i16> %op1 to <128 x i32>
%2 = zext <128 x i16> %op2 to <128 x i32>
%mul = mul <128 x i32> %1, %2
- %shr = lshr <128 x i32> %mul, %splat
+ %shr = lshr <128 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%res = trunc <128 x i32> %shr to <128 x i16>
store <128 x i16> %res, <128 x i16>* %a
ret void
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: shrn v0.2s, v0.2d, #32
; CHECK-NEXT: ret
- %insert = insertelement <2 x i64> undef, i64 32, i64 0
- %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
%1 = zext <2 x i32> %op1 to <2 x i64>
%2 = zext <2 x i32> %op2 to <2 x i64>
%mul = mul <2 x i64> %1, %2
- %shr = lshr <2 x i64> %mul, %splat
+ %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
%res = trunc <2 x i64> %shr to <2 x i32>
ret <2 x i32> %res
}
; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s
; CHECK-NEXT: ret
- %insert = insertelement <4 x i64> undef, i64 32, i64 0
- %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
%1 = zext <4 x i32> %op1 to <4 x i64>
%2 = zext <4 x i32> %op2 to <4 x i64>
%mul = mul <4 x i64> %1, %2
- %shr = lshr <4 x i64> %mul, %splat
+ %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
%res = trunc <4 x i64> %shr to <4 x i32>
ret <4 x i32> %res
}
define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; Prefix rework: VBITS_EQ_256 plus the widened mul+lsr expectation under
; VBITS_GE_512 merge into one VBITS_GE_256 block selecting native umulh.
-; VBITS_EQ_256-LABEL: umulh_v8i32:
-; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_EQ_256-NEXT: ret
-;
-; VBITS_GE_512-LABEL: umulh_v8i32:
-; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: ld1w { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: mul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_512-NEXT: lsr z0.d, p0/m, z0.d, #32
-; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x0]
-; VBITS_GE_512-NEXT: ret
-
+; VBITS_GE_256-LABEL: umulh_v8i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT: umulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b
; FIX: %insert was left behind as a kept line even though its only user
; (%splat) is gone; that leaves dead IR in the test. Remove both lines, as
; every sibling function in this file does.
- %insert = insertelement <8 x i64> undef, i64 32, i64 0
- %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
%1 = zext <8 x i32> %op1 to <8 x i64>
%2 = zext <8 x i32> %op2 to <8 x i64>
; umulh pattern: trunc((zext(x) * zext(y)) >> 32)
%mul = mul <8 x i64> %1, %2
- %shr = lshr <8 x i64> %mul, %splat
+ %shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%res = trunc <8 x i64> %shr to <8 x i32>
store <8 x i32> %res, <8 x i32>* %a
ret void
}
define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; Prefix rework: VBITS_EQ_512 plus the widened mul+lsr expectation under
; VBITS_GE_1024 merge into one VBITS_GE_512 block selecting native umulh.
-; VBITS_EQ_512-LABEL: umulh_v16i32:
-; VBITS_EQ_512: // %bb.0:
-; VBITS_EQ_512-NEXT: ptrue p0.s, vl16
-; VBITS_EQ_512-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_512-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_EQ_512-NEXT: umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_512-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_EQ_512-NEXT: ret
-;
-; VBITS_GE_1024-LABEL: umulh_v16i32:
-; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: ld1w { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ld1w { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: mul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_1024-NEXT: lsr z0.d, p0/m, z0.d, #32
-; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT: ret
-
+; VBITS_GE_512-LABEL: umulh_v16i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: umulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
; Constant splat shift amount replaces the insertelement/shufflevector idiom.
- %insert = insertelement <16 x i64> undef, i64 32, i64 0
- %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer
%1 = zext <16 x i32> %op1 to <16 x i64>
%2 = zext <16 x i32> %op2 to <16 x i64>
; umulh pattern: trunc((zext(x) * zext(y)) >> 32)
%mul = mul <16 x i64> %1, %2
- %shr = lshr <16 x i64> %mul, %splat
+ %shr = lshr <16 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%res = trunc <16 x i64> %shr to <16 x i32>
store <16 x i32> %res, <16 x i32>* %a
ret void
}
define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
-; VBITS_EQ_1024-LABEL: umulh_v32i32:
-; VBITS_EQ_1024: // %bb.0:
-; VBITS_EQ_1024-NEXT: ptrue p0.s, vl32
-; VBITS_EQ_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_EQ_1024-NEXT: umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_1024-NEXT: st1w { z0.s }, p0, [x0]
-; VBITS_EQ_1024-NEXT: ret
-;
-; VBITS_GE_2048-LABEL: umulh_v32i32:
-; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ld1w { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: mul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_2048-NEXT: lsr z0.d, p0/m, z0.d, #32
-; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT: ret
-
+; VBITS_GE_1024-LABEL: umulh_v32i32:
+; VBITS_GE_1024: // %bb.0:
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_1024-NEXT: umulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_1024-NEXT: ret
%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b
- %insert = insertelement <32 x i64> undef, i64 32, i64 0
- %splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer
%1 = zext <32 x i32> %op1 to <32 x i64>
%2 = zext <32 x i32> %op2 to <32 x i64>
%mul = mul <32 x i64> %1, %2
- %shr = lshr <32 x i64> %mul, %splat
+ %shr = lshr <32 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%res = trunc <32 x i64> %shr to <32 x i32>
store <32 x i32> %res, <32 x i32>* %a
ret void
; VBITS_GE_2048-NEXT: ret
%op1 = load <64 x i32>, <64 x i32>* %a
%op2 = load <64 x i32>, <64 x i32>* %b
- %insert = insertelement <64 x i64> undef, i64 32, i64 0
- %splat = shufflevector <64 x i64> %insert, <64 x i64> undef, <64 x i32> zeroinitializer
%1 = zext <64 x i32> %op1 to <64 x i64>
%2 = zext <64 x i32> %op2 to <64 x i64>
%mul = mul <64 x i64> %1, %2
- %shr = lshr <64 x i64> %mul, %splat
+ %shr = lshr <64 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%res = trunc <64 x i64> %shr to <64 x i32>
store <64 x i32> %res, <64 x i32>* %a
ret void
; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
- %insert = insertelement <1 x i128> undef, i128 64, i128 0
- %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
%1 = zext <1 x i64> %op1 to <1 x i128>
%2 = zext <1 x i64> %op2 to <1 x i128>
%mul = mul <1 x i128> %1, %2
- %shr = lshr <1 x i128> %mul, %splat
+ %shr = lshr <1 x i128> %mul, <i128 64>
%res = trunc <1 x i128> %shr to <1 x i64>
ret <1 x i64> %res
}
; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
- %insert = insertelement <2 x i128> undef, i128 64, i128 0
- %splat = shufflevector <2 x i128> %insert, <2 x i128> undef, <2 x i32> zeroinitializer
%1 = zext <2 x i64> %op1 to <2 x i128>
%2 = zext <2 x i64> %op2 to <2 x i128>
%mul = mul <2 x i128> %1, %2
- %shr = lshr <2 x i128> %mul, %splat
+ %shr = lshr <2 x i128> %mul, <i128 64, i128 64>
%res = trunc <2 x i128> %shr to <2 x i64>
ret <2 x i64> %res
}
; CHECK-NEXT: ret
%op1 = load <4 x i64>, <4 x i64>* %a
%op2 = load <4 x i64>, <4 x i64>* %b
- %insert = insertelement <4 x i128> undef, i128 64, i128 0
- %splat = shufflevector <4 x i128> %insert, <4 x i128> undef, <4 x i32> zeroinitializer
%1 = zext <4 x i64> %op1 to <4 x i128>
%2 = zext <4 x i64> %op2 to <4 x i128>
%mul = mul <4 x i128> %1, %2
- %shr = lshr <4 x i128> %mul, %splat
+ %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
%res = trunc <4 x i128> %shr to <4 x i64>
store <4 x i64> %res, <4 x i64>* %a
ret void
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%op2 = load <8 x i64>, <8 x i64>* %b
- %insert = insertelement <8 x i128> undef, i128 64, i128 0
- %splat = shufflevector <8 x i128> %insert, <8 x i128> undef, <8 x i32> zeroinitializer
%1 = zext <8 x i64> %op1 to <8 x i128>
%2 = zext <8 x i64> %op2 to <8 x i128>
%mul = mul <8 x i128> %1, %2
- %shr = lshr <8 x i128> %mul, %splat
+ %shr = lshr <8 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
%res = trunc <8 x i128> %shr to <8 x i64>
store <8 x i64> %res, <8 x i64>* %a
ret void
; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x i64>, <16 x i64>* %a
%op2 = load <16 x i64>, <16 x i64>* %b
- %insert = insertelement <16 x i128> undef, i128 64, i128 0
- %splat = shufflevector <16 x i128> %insert, <16 x i128> undef, <16 x i32> zeroinitializer
%1 = zext <16 x i64> %op1 to <16 x i128>
%2 = zext <16 x i64> %op2 to <16 x i128>
%mul = mul <16 x i128> %1, %2
- %shr = lshr <16 x i128> %mul, %splat
+ %shr = lshr <16 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
%res = trunc <16 x i128> %shr to <16 x i64>
store <16 x i64> %res, <16 x i64>* %a
ret void
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x i64>, <32 x i64>* %a
%op2 = load <32 x i64>, <32 x i64>* %b
- %insert = insertelement <32 x i128> undef, i128 64, i128 0
- %splat = shufflevector <32 x i128> %insert, <32 x i128> undef, <32 x i32> zeroinitializer
%1 = zext <32 x i64> %op1 to <32 x i128>
%2 = zext <32 x i64> %op2 to <32 x i128>
%mul = mul <32 x i128> %1, %2
- %shr = lshr <32 x i128> %mul, %splat
+ %shr = lshr <32 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
%res = trunc <32 x i128> %shr to <32 x i64>
store <32 x i64> %res, <32 x i64>* %a
ret void