From 8b831cbb2a45abd1979a5f82cba8159e1f94d379 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 13 Nov 2016 01:51:55 +0000 Subject: [PATCH] [InstCombine][AVX-512] Expand vector shift handling to work on the AVX-512 shift by immediate and shift by single value. This does not include support for the AVX-512 variable shifts. That will be coming in a future patch. llvm-svn: 286739 --- .../Transforms/InstCombine/InstCombineCalls.cpp | 46 +- .../Transforms/InstCombine/x86-vector-shifts.ll | 808 +++++++++++++++++++++ 2 files changed, 853 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index ebeba7e..d109df3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -268,6 +268,16 @@ static Value *simplifyX86immShift(const IntrinsicInst &II, case Intrinsic::x86_avx2_psra_w: case Intrinsic::x86_avx2_psrai_d: case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx512_psra_q_128: + case Intrinsic::x86_avx512_psrai_q_128: + case Intrinsic::x86_avx512_psra_q_256: + case Intrinsic::x86_avx512_psrai_q_256: + case Intrinsic::x86_avx512_psra_d_512: + case Intrinsic::x86_avx512_psra_q_512: + case Intrinsic::x86_avx512_psra_w_512: + case Intrinsic::x86_avx512_psrai_d_512: + case Intrinsic::x86_avx512_psrai_q_512: + case Intrinsic::x86_avx512_psrai_w_512: LogicalShift = false; ShiftLeft = false; break; case Intrinsic::x86_sse2_psrl_d: @@ -282,6 +292,12 @@ static Value *simplifyX86immShift(const IntrinsicInst &II, case Intrinsic::x86_avx2_psrli_d: case Intrinsic::x86_avx2_psrli_q: case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx512_psrl_d_512: + case Intrinsic::x86_avx512_psrl_q_512: + case Intrinsic::x86_avx512_psrl_w_512: + case Intrinsic::x86_avx512_psrli_d_512: + case Intrinsic::x86_avx512_psrli_q_512: + case Intrinsic::x86_avx512_psrli_w_512: LogicalShift = true; ShiftLeft = false; break; case Intrinsic::x86_sse2_psll_d: @@ -296,6 +312,12 @@ static Value *simplifyX86immShift(const IntrinsicInst &II, case Intrinsic::x86_avx2_pslli_d: case Intrinsic::x86_avx2_pslli_q: case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx512_psll_d_512: + case Intrinsic::x86_avx512_psll_q_512: + case Intrinsic::x86_avx512_psll_w_512: + case Intrinsic::x86_avx512_pslli_d_512: + case Intrinsic::x86_avx512_pslli_q_512: + case Intrinsic::x86_avx512_pslli_w_512: LogicalShift = true; ShiftLeft = true; break; } @@ -1776,18 +1798,29 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_sse2_psrai_w: case Intrinsic::x86_avx2_psrai_d: case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx512_psrai_q_128: + case Intrinsic::x86_avx512_psrai_q_256: + case Intrinsic::x86_avx512_psrai_d_512: + case Intrinsic::x86_avx512_psrai_q_512: + case Intrinsic::x86_avx512_psrai_w_512: case Intrinsic::x86_sse2_psrli_d: case Intrinsic::x86_sse2_psrli_q: case Intrinsic::x86_sse2_psrli_w: case Intrinsic::x86_avx2_psrli_d: case Intrinsic::x86_avx2_psrli_q: case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx512_psrli_d_512: + case Intrinsic::x86_avx512_psrli_q_512: + case Intrinsic::x86_avx512_psrli_w_512: case Intrinsic::x86_sse2_pslli_d: case Intrinsic::x86_sse2_pslli_q: case Intrinsic::x86_sse2_pslli_w: case Intrinsic::x86_avx2_pslli_d: case Intrinsic::x86_avx2_pslli_q: case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx512_pslli_d_512: + case Intrinsic::x86_avx512_pslli_q_512: + case Intrinsic::x86_avx512_pslli_w_512: if (Value *V = simplifyX86immShift(*II, *Builder)) return replaceInstUsesWith(*II, V); break; @@ -1796,18 +1829,29 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_sse2_psra_w: case Intrinsic::x86_avx2_psra_d: case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx512_psra_q_128: + case Intrinsic::x86_avx512_psra_q_256: + case Intrinsic::x86_avx512_psra_d_512: + case Intrinsic::x86_avx512_psra_q_512: + case Intrinsic::x86_avx512_psra_w_512: case Intrinsic::x86_sse2_psrl_d: case Intrinsic::x86_sse2_psrl_q: case Intrinsic::x86_sse2_psrl_w: case Intrinsic::x86_avx2_psrl_d: case Intrinsic::x86_avx2_psrl_q: case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx512_psrl_d_512: + case Intrinsic::x86_avx512_psrl_q_512: + case Intrinsic::x86_avx512_psrl_w_512: case Intrinsic::x86_sse2_psll_d: case Intrinsic::x86_sse2_psll_q: case Intrinsic::x86_sse2_psll_w: case Intrinsic::x86_avx2_psll_d: case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: { + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx512_psll_d_512: + case Intrinsic::x86_avx512_psll_q_512: + case Intrinsic::x86_avx512_psll_w_512: { if (Value *V = simplifyX86immShift(*II, *Builder)) return replaceInstUsesWith(*II, V); diff --git a/llvm/test/Transforms/InstCombine/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/x86-vector-shifts.ll index 012a826..51bf6ed 100644 --- a/llvm/test/Transforms/InstCombine/x86-vector-shifts.ll +++ b/llvm/test/Transforms/InstCombine/x86-vector-shifts.ll @@ -110,6 +110,136 @@ define <8 x i32> @avx2_psrai_d_64(<8 x i32> %v) { ret <8 x i32> %1 } +define <2 x i64> @avx512_psrai_q_128_0(<2 x i64> %v) { +; CHECK-LABEL: @avx512_psrai_q_128_0( +; CHECK-NEXT: ret <2 x i64> %v +; + %1 = tail call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %v, i32 0) + ret <2 x i64> %1 +} + +define <2 x i64> @avx512_psrai_q_128_15(<2 x i64> %v) { +; CHECK-LABEL: @avx512_psrai_q_128_15( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i64> %v, +; CHECK-NEXT: ret <2 x i64> [[TMP1]] +; + %1 = tail call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %v, i32 15) + ret <2 x i64> %1 +} + +define <2 x i64> @avx512_psrai_q_128_64(<2 x i64> %v) { +; CHECK-LABEL: @avx512_psrai_q_128_64( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i64> %v, +; CHECK-NEXT: ret <2 x i64> [[TMP1]] +; + %1 = tail call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %v, i32 64) + ret <2 x i64> %1 +} + +define <4 x i64> @avx512_psrai_q_256_0(<4 x i64> %v) { +; CHECK-LABEL: @avx512_psrai_q_256_0( +; CHECK-NEXT: ret <4 x i64> %v +; + %1 = tail call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %v, i32 0) + ret <4 x i64> %1 +} + +define <4 x i64> @avx512_psrai_q_256_15(<4 x i64> %v) { +; CHECK-LABEL: @avx512_psrai_q_256_15( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i64> %v, +; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; + %1 = tail call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %v, i32 15) + ret <4 x i64> %1 +} + +define <4 x i64> @avx512_psrai_q_256_64(<4 x i64> %v) { +; CHECK-LABEL: @avx512_psrai_q_256_64( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i64> %v, +; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; + %1 = tail call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %v, i32 64) + ret <4 x i64> %1 +} + +define <32 x i16> @avx512_psrai_w_512_0(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psrai_w_512_0( +; CHECK-NEXT: ret <32 x i16> %v +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %v, i32 0) + ret <32 x i16> %1 +} + +define <32 x i16> @avx512_psrai_w_512_15(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psrai_w_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <32 x i16> %v, +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %v, i32 15) + ret <32 x i16> %1 +} + +define <32 x i16> @avx512_psrai_w_512_64(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psrai_w_512_64( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <32 x i16> %v, +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %v, i32 64) + ret <32 x i16> %1 +} + +define <16 x i32> @avx512_psrai_d_512_0(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psrai_d_512_0( +; CHECK-NEXT: ret <16 x i32> %v +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %v, i32 0) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_psrai_d_512_15(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psrai_d_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <16 x i32> %v, +; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %v, i32 15) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_psrai_d_512_64(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psrai_d_512_64( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <16 x i32> %v, +; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %v, i32 64) + ret <16 x i32> %1 +} + +define <8 x i64> @avx512_psrai_q_512_0(<8 x i64> %v) { +; CHECK-LABEL: @avx512_psrai_q_512_0( +; CHECK-NEXT: ret <8 x i64> %v +; + %1 = tail call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %v, i32 0) + ret <8 x i64> %1 +} + +define <8 x i64> @avx512_psrai_q_512_15(<8 x i64> %v) { +; CHECK-LABEL: @avx512_psrai_q_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <8 x i64> %v, +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %1 = tail call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %v, i32 15) + ret <8 x i64> %1 +} + +define <8 x i64> @avx512_psrai_q_512_64(<8 x i64> %v) { +; CHECK-LABEL: @avx512_psrai_q_512_64( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <8 x i64> %v, +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %1 = tail call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %v, i32 64) + ret <8 x i64> %1 +} + ; ; LSHR - Immediate ; @@ -264,6 +394,81 @@ define <4 x i64> @avx2_psrli_q_64(<4 x i64> %v) { ret <4 x i64> %1 } +define <32 x i16> @avx512_psrli_w_512_0(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psrli_w_512_0( +; CHECK-NEXT: ret <32 x i16> %v +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %v, i32 0) + ret <32 x i16> %1 +} + +define <32 x i16> @avx512_psrli_w_512_15(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psrli_w_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <32 x i16> %v, +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %v, i32 15) + ret <32 x i16> %1 +} + +define <32 x i16> @avx512_psrli_w_512_64(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psrli_w_512_64( +; CHECK-NEXT: ret <32 x i16> zeroinitializer +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %v, i32 64) + ret <32 x i16> %1 +} + +define <16 x i32> @avx512_psrli_d_512_0(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psrli_d_512_0( +; CHECK-NEXT: ret <16 x i32> %v +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %v, i32 0) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_psrli_d_512_15(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psrli_d_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <16 x i32> %v, +; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %v, i32 15) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_psrli_d_512_64(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psrli_d_512_64( +; CHECK-NEXT: ret <16 x i32> zeroinitializer +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %v, i32 64) + ret <16 x i32> %1 +} + +define <8 x i64> @avx512_psrli_q_512_0(<8 x i64> %v) { +; CHECK-LABEL: @avx512_psrli_q_512_0( +; CHECK-NEXT: ret <8 x i64> %v +; + %1 = tail call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %v, i32 0) + ret <8 x i64> %1 +} + +define <8 x i64> @avx512_psrli_q_512_15(<8 x i64> %v) { +; CHECK-LABEL: @avx512_psrli_q_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <8 x i64> %v, +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %1 = tail call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %v, i32 15) + ret <8 x i64> %1 +} + +define <8 x i64> @avx512_psrli_q_512_64(<8 x i64> %v) { +; CHECK-LABEL: @avx512_psrli_q_512_64( +; CHECK-NEXT: ret <8 x i64> zeroinitializer +; + %1 = tail call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %v, i32 64) + ret <8 x i64> %1 +} + ; ; SHL - Immediate ; @@ -418,6 +623,81 @@ define <4 x i64> @avx2_pslli_q_64(<4 x i64> %v) { ret <4 x i64> %1 } +define <32 x i16> @avx512_pslli_w_512_0(<32 x i16> %v) { +; CHECK-LABEL: @avx512_pslli_w_512_0( +; CHECK-NEXT: ret <32 x i16> %v +; + %1 = tail call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %v, i32 0) + ret <32 x i16> %1 +} + +define <32 x i16> @avx512_pslli_w_512_15(<32 x i16> %v) { +; CHECK-LABEL: @avx512_pslli_w_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = shl <32 x i16> %v, +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = tail call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %v, i32 15) + ret <32 x i16> %1 +} + +define <32 x i16> @avx512_pslli_w_512_64(<32 x i16> %v) { +; CHECK-LABEL: @avx512_pslli_w_512_64( +; CHECK-NEXT: ret <32 x i16> zeroinitializer +; + %1 = tail call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %v, i32 64) + ret <32 x i16> %1 +} + +define <16 x i32> @avx512_pslli_d_512_0(<16 x i32> %v) { +; CHECK-LABEL: @avx512_pslli_d_512_0( +; CHECK-NEXT: ret <16 x i32> %v +; + %1 = tail call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %v, i32 0) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_pslli_d_512_15(<16 x i32> %v) { +; CHECK-LABEL: @avx512_pslli_d_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = shl <16 x i32> %v, +; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; + %1 = tail call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %v, i32 15) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_pslli_d_512_64(<16 x i32> %v) { +; CHECK-LABEL: @avx512_pslli_d_512_64( +; CHECK-NEXT: ret <16 x i32> zeroinitializer +; + %1 = tail call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %v, i32 64) + ret <16 x i32> %1 +} + +define <8 x i64> @avx512_pslli_q_512_0(<8 x i64> %v) { +; CHECK-LABEL: @avx512_pslli_q_512_0( +; CHECK-NEXT: ret <8 x i64> %v +; + %1 = tail call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %v, i32 0) + ret <8 x i64> %1 +} + +define <8 x i64> @avx512_pslli_q_512_15(<8 x i64> %v) { +; CHECK-LABEL: @avx512_pslli_q_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = shl <8 x i64> %v, +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %1 = tail call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %v, i32 15) + ret <8 x i64> %1 +} + +define <8 x i64> @avx512_pslli_q_512_64(<8 x i64> %v) { +; CHECK-LABEL: @avx512_pslli_q_512_64( +; CHECK-NEXT: ret <8 x i64> zeroinitializer +; + %1 = tail call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %v, i32 64) + ret <8 x i64> %1 +} + ; ; ASHR - Constant Vector ; @@ -562,6 +842,154 @@ define <8 x i32> @avx2_psra_d_64(<8 x i32> %v) { ret <8 x i32> %1 } +define <2 x i64> @avx512_psra_q_128_0(<2 x i64> %v) { +; CHECK-LABEL: @avx512_psra_q_128_0( +; CHECK-NEXT: ret <2 x i64> %v +; + %1 = tail call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %v, <2 x i64> zeroinitializer) + ret <2 x i64> %1 +} + +define <2 x i64> @avx512_psra_q_128_15(<2 x i64> %v) { +; CHECK-LABEL: @avx512_psra_q_128_15( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i64> %v, +; CHECK-NEXT: ret <2 x i64> [[TMP1]] +; + %1 = tail call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %v, <2 x i64> ) + ret <2 x i64> %1 +} + +define <2 x i64> @avx512_psra_q_128_64(<2 x i64> %v) { +; CHECK-LABEL: @avx512_psra_q_128_64( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i64> %v, +; CHECK-NEXT: ret <2 x i64> [[TMP1]] +; + %1 = tail call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %v, <2 x i64> ) + ret <2 x i64> %1 +} + +define <4 x i64> @avx512_psra_q_256_0(<4 x i64> %v) { +; CHECK-LABEL: @avx512_psra_q_256_0( +; CHECK-NEXT: ret <4 x i64> %v +; + %1 = tail call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %v, <2 x i64> zeroinitializer) + ret <4 x i64> %1 +} + +define <4 x i64> @avx512_psra_q_256_15(<4 x i64> %v) { +; CHECK-LABEL: @avx512_psra_q_256_15( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i64> %v, +; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; + %1 = tail call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %v, <2 x i64> ) + ret <4 x i64> %1 +} + +define <4 x i64> @avx512_psra_q_256_64(<4 x i64> %v) { +; CHECK-LABEL: @avx512_psra_q_256_64( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i64> %v, +; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; + %1 = tail call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %v, <2 x i64> ) + ret <4 x i64> %1 +} + +define <32 x i16> @avx512_psra_w_512_0(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psra_w_512_0( +; CHECK-NEXT: ret <32 x i16> %v +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %v, <8 x i16> zeroinitializer) + ret <32 x i16> %1 +} + +define <32 x i16> @avx512_psra_w_512_15(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psra_w_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <32 x i16> %v, +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %v, <8 x i16> ) + ret <32 x i16> %1 +} + +define <32 x i16> @avx512_psra_w_512_15_splat(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psra_w_512_15_splat( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <32 x i16> %v, +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %v, <8 x i16> ) + ret <32 x i16> %1 +} + +define <32 x i16> @avx512_psra_w_512_64(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psra_w_512_64( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <32 x i16> %v, +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %v, <8 x i16> ) + ret <32 x i16> %1 +} + +define <16 x i32> @avx512_psra_d_512_0(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psra_d_512_0( +; CHECK-NEXT: ret <16 x i32> %v +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %v, <4 x i32> zeroinitializer) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_psra_d_512_15(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psra_d_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <16 x i32> %v, +; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %v, <4 x i32> ) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_psra_d_512_15_splat(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psra_d_512_15_splat( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <16 x i32> %v, +; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %v, <4 x i32> ) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_psra_d_512_64(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psra_d_512_64( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <16 x i32> %v, +; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %v, <4 x i32> ) + ret <16 x i32> %1 +} + +define <8 x i64> @avx512_psra_q_512_0(<8 x i64> %v) { +; CHECK-LABEL: @avx512_psra_q_512_0( +; CHECK-NEXT: ret <8 x i64> %v +; + %1 = tail call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %v, <2 x i64> zeroinitializer) + ret <8 x i64> %1 +} + +define <8 x i64> @avx512_psra_q_512_15(<8 x i64> %v) { +; CHECK-LABEL: @avx512_psra_q_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <8 x i64> %v, +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %1 = tail call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %v, <2 x i64> ) + ret <8 x i64> %1 +} + +define <8 x i64> @avx512_psra_q_512_64(<8 x i64> %v) { +; CHECK-LABEL: @avx512_psra_q_512_64( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <8 x i64> %v, +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %1 = tail call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %v, <2 x i64> ) + ret <8 x i64> %1 +} + ; ; LSHR - Constant Vector ; @@ -748,6 +1176,97 @@ define <4 x i64> @avx2_psrl_q_64(<4 x i64> %v) { ret <4 x i64> %1 } +define <32 x i16> @avx512_psrl_w_512_0(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psrl_w_512_0( +; CHECK-NEXT: ret <32 x i16> %v +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> zeroinitializer) + ret <32 x i16> %1 +} + +define <32 x i16> @avx512_psrl_w_512_15(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psrl_w_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <32 x i16> %v, +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> ) + ret <32 x i16> %1 +} + +define <32 x i16> @avx512_psrl_w_512_15_splat(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psrl_w_512_15_splat( +; CHECK-NEXT: ret <32 x i16> zeroinitializer +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> ) + ret <32 x i16> %1 +} + +define <32 x i16> @avx512_psrl_w_512_64(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psrl_w_512_64( +; CHECK-NEXT: ret <32 x i16> zeroinitializer +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> ) + ret <32 x i16> %1 +} + +define <16 x i32> @avx512_psrl_d_512_0(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psrl_d_512_0( +; CHECK-NEXT: ret <16 x i32> %v +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> zeroinitializer) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_psrl_d_512_15(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psrl_d_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <16 x i32> %v, +; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> ) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_psrl_d_512_15_splat(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psrl_d_512_15_splat( +; CHECK-NEXT: ret <16 x i32> zeroinitializer +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> ) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_psrl_d_512_64(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psrl_d_512_64( +; CHECK-NEXT: ret <16 x i32> zeroinitializer +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> ) + ret <16 x i32> %1 +} + +define <8 x i64> @avx512_psrl_q_512_0(<8 x i64> %v) { +; CHECK-LABEL: @avx512_psrl_q_512_0( +; CHECK-NEXT: ret <8 x i64> %v +; + %1 = tail call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %v, <2 x i64> zeroinitializer) + ret <8 x i64> %1 +} + +define <8 x i64> @avx512_psrl_q_512_15(<8 x i64> %v) { +; CHECK-LABEL: @avx512_psrl_q_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <8 x i64> %v, +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %1 = tail call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %v, <2 x i64> ) + ret <8 x i64> %1 +} + +define <8 x i64> @avx512_psrl_q_512_64(<8 x i64> %v) { +; CHECK-LABEL: @avx512_psrl_q_512_64( +; CHECK-NEXT: ret <8 x i64> zeroinitializer +; + %1 = tail call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %v, <2 x i64> ) + ret <8 x i64> %1 +} + ; ; SHL - Constant Vector ; @@ -934,6 +1453,97 @@ define <4 x i64> @avx2_psll_q_64(<4 x i64> %v) { ret <4 x i64> %1 } +define <32 x i16> @avx512_psll_w_512_0(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psll_w_512_0( +; CHECK-NEXT: ret <32 x i16> %v +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %v, <8 x i16> zeroinitializer) + ret <32 x i16> %1 +} + +define <32 x i16> @avx512_psll_w_512_15(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psll_w_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = shl <32 x i16> %v, +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %v, <8 x i16> ) + ret <32 x i16> %1 +} + +define <32 x i16> @avx512_psll_w_15_512_splat(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psll_w_15_512_splat( +; CHECK-NEXT: ret <32 x i16> zeroinitializer +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %v, <8 x i16> ) + ret <32 x i16> %1 +} + +define <32 x i16> @avx512_psll_w_512_64(<32 x i16> %v) { +; CHECK-LABEL: @avx512_psll_w_512_64( +; CHECK-NEXT: ret <32 x i16> zeroinitializer +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %v, <8 x i16> ) + ret <32 x i16> %1 +} + +define <16 x i32> @avx512_psll_d_512_0(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psll_d_512_0( +; CHECK-NEXT: ret <16 x i32> %v +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %v, <4 x i32> zeroinitializer) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_psll_d_512_15(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psll_d_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = shl <16 x i32> %v, +; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %v, <4 x i32> ) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_psll_d_512_15_splat(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psll_d_512_15_splat( +; CHECK-NEXT: ret <16 x i32> zeroinitializer +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %v, <4 x i32> ) + ret <16 x i32> %1 +} + +define <16 x i32> @avx512_psll_d_512_64(<16 x i32> %v) { +; CHECK-LABEL: @avx512_psll_d_512_64( +; CHECK-NEXT: ret <16 x i32> zeroinitializer +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %v, <4 x i32> ) + ret <16 x i32> %1 +} + +define <8 x i64> @avx512_psll_q_512_0(<8 x i64> %v) { +; CHECK-LABEL: @avx512_psll_q_512_0( +; CHECK-NEXT: ret <8 x i64> %v +; + %1 = tail call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %v, <2 x i64> zeroinitializer) + ret <8 x i64> %1 +} + +define <8 x i64> @avx512_psll_q_512_15(<8 x i64> %v) { +; CHECK-LABEL: @avx512_psll_q_512_15( +; CHECK-NEXT: [[TMP1:%.*]] = shl <8 x i64> %v, +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %1 = tail call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %v, <2 x i64> ) + ret <8 x i64> %1 +} + +define <8 x i64> @avx512_psll_q_512_64(<8 x i64> %v) { +; CHECK-LABEL: @avx512_psll_q_512_64( +; CHECK-NEXT: ret <8 x i64> zeroinitializer +; + %1 = tail call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %v, <2 x i64> ) + ret <8 x i64> %1 +} + ; ; ASHR - Constant Per-Element Vector ; @@ -1438,6 +2048,56 @@ define <8 x i32> @avx2_psra_d_var(<8 x i32> %v, <4 x i32> %a) { ret <8 x i32> %2 } +define <2 x i64> @avx512_psra_q_128_var(<2 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx512_psra_q_128_var( +; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %v, <2 x i64> %a) +; CHECK-NEXT: ret <2 x i64> [[TMP1]] +; + %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> + %2 = tail call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %v, <2 x i64> %1) + ret <2 x i64> %2 +} + +define <4 x i64> @avx512_psra_q_256_var(<4 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx512_psra_q_256_var( +; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %v, <2 x i64> %a) +; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; + %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> + %2 = tail call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %v, <2 x i64> %1) + ret <4 x i64> %2 +} + +define <32 x i16> @avx512_psra_w_512_var(<32 x i16> %v, <8 x i16> %a) { +; CHECK-LABEL: @avx512_psra_w_512_var( +; CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %v, <8 x i16> %a) +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + %2 = tail call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %v, <8 x i16> %1) + ret <32 x i16> %2 +} + +define <16 x i32> @avx512_psra_d_512_var(<16 x i32> %v, <4 x i32> %a) { +; CHECK-LABEL: @avx512_psra_d_512_var( +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %v, <4 x i32> %a) +; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; + %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + %2 = tail call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %v, <4 x i32> %1) + ret <16 x i32> %2 +} + +define <8 x i64> @avx512_psra_q_512_var(<8 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx512_psra_q_512_var( +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %v, <2 x i64> %a) +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> + %2 = tail call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %v, <2 x i64> %1) + ret <8 x i64> %2 +} + define <8 x i16> @sse2_psrl_w_var(<8 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @sse2_psrl_w_var( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %v, <8 x i16> %a) @@ -1522,6 +2182,60 @@ define <4 x i64> @avx2_psrl_q_var(<4 x i64> %v, <2 x i64> %a) { ret <4 x i64> %2 } +define <32 x i16> @avx512_psrl_w_512_var(<32 x i16> %v, <8 x i16> %a) { +; CHECK-LABEL: @avx512_psrl_w_512_var( +; CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> %a) +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + %2 = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> %1) + ret <32 x i16> %2 +} + +define <32 x i16> @avx512_psrl_w_512_var_bc(<32 x i16> %v, <16 x i8> %a) { +; CHECK-LABEL: @avx512_psrl_w_512_var_bc( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> %a to <8 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> [[TMP1]]) +; CHECK-NEXT: ret <32 x i16> [[TMP2]] +; + %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + %2 = bitcast <16 x i8> %1 to <8 x i16> + %3 = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> %2) + ret <32 x i16> %3 +} + +define <16 x i32> @avx512_psrl_d_512_var(<16 x i32> %v, <4 x i32> %a) { +; CHECK-LABEL: @avx512_psrl_d_512_var( +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> %a) +; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; + %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + %2 = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> %1) + ret <16 x i32> %2 +} + +define <16 x i32> @avx512_psrl_d_512_var_bc(<16 x i32> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx512_psrl_d_512_var_bc( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> %a to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> [[TMP1]]) +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> + %2 = bitcast <2 x i64> %1 to <4 x i32> + %3 = tail call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %v, <4 x i32> %2) + ret <16 x i32> %3 +} + +define <8 x i64> @avx512_psrl_q_512_var(<8 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx512_psrl_q_512_var( +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %v, <2 x i64> %a) +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> + %2 = tail call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %v, <2 x i64> %1) + ret <8 x i64> %2 +} + define <8 x i16> @sse2_psll_w_var(<8 x i16> %v, <8 x i16> %a) { ; CHECK-LABEL: @sse2_psll_w_var( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %v, <8 x i16> %a) @@ -1582,6 +2296,36 @@ define <4 x i64> @avx2_psll_q_var(<4 x i64> %v, <2 x i64> %a) { ret <4 x i64> %2 } +define <32 x i16> @avx512_psll_w_512_var(<32 x i16> %v, <8 x i16> %a) { +; CHECK-LABEL: @avx512_psll_w_512_var( +; CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %v, <8 x i16> %a) +; CHECK-NEXT: ret <32 x i16> [[TMP1]] +; + %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> + %2 = tail call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %v, <8 x i16> %1) + ret <32 x i16> %2 +} + +define <16 x i32> @avx512_psll_d_512_var(<16 x i32> %v, <4 x i32> %a) { +; CHECK-LABEL: @avx512_psll_d_512_var( +; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %v, <4 x i32> %a) +; CHECK-NEXT: ret <16 x i32> [[TMP1]] +; + %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> + %2 = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %v, <4 x i32> %1) + ret <16 x i32> %2 +} + +define <8 x i64> @avx512_psll_q_512_var(<8 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx512_psll_q_512_var( +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %v, <2 x i64> %a) +; CHECK-NEXT: ret <8 x i64> [[TMP1]] +; + %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> + %2 = tail call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %v, <2 x i64> %1) + ret <8 x i64> %2 +} + ; ; Constant Folding ; @@ -1670,6 +2414,48 @@ define <8 x i32> @test_avx2_psra_d_8() { ret <8 x i32> %4 } +define <32 x i16> @test_avx512_psra_w_512_0(<32 x i16> %A) { +; CHECK-LABEL: @test_avx512_psra_w_512_0( +; CHECK-NEXT: ret <32 x i16> %A +; + %1 = tail call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %A, i32 0) + %2 = tail call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %1, <8 x i16> ) + %3 = tail call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %2, i32 0) + ret <32 x i16> %3 +} + +define <32 x i16> @test_avx512_psra_w_512_8(<32 x i16> %A) { +; CHECK-LABEL: @test_avx512_psra_w_512_8( +; CHECK-NEXT: ret <32 x i16> +; + %1 = bitcast <8 x i64> to <32 x i16> + %2 = tail call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %1, i32 3) + %3 = tail call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %2, <8 x i16> ) + %4 = tail call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %3, i32 2) + ret <32 x i16> %4 +} + +define <16 x i32> @test_avx512_psra_d_512_0(<16 x i32> %A) { +; CHECK-LABEL: @test_avx512_psra_d_512_0( +; CHECK-NEXT: ret <16 x i32> %A +; + %1 = tail call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %A, i32 0) + %2 = tail call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %1, <4 x i32> ) + %3 = tail call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %2, i32 0) + ret <16 x i32> %3 +} + +define <16 x i32> @test_avx512_psra_d_512_8() { +; CHECK-LABEL: @test_avx512_psra_d_512_8( +; CHECK-NEXT: ret <16 x i32> +; + %1 = bitcast <8 x i64> to <16 x i32> + %2 = tail call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %1, i32 3) + %3 = tail call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %2, <4 x i32> ) + %4 = tail call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %3, i32 2) + ret <16 x i32> %4 +} + ; ; Old Tests ; @@ -1865,6 +2651,12 @@ define <4 x i64> @test_avx2_psrl_0() { ret <4 x i64> %16 } +declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) #1 +declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) #1 +declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) #1 +declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) #1 +declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) #1 +declare <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16>, <8 x i16>) #1 declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1 declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1 declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1 @@ -1878,6 +2670,12 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1 declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1 declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1 +declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) #1 +declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) #1 +declare <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16>, i32) #1 +declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) #1 +declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) #1 +declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) #1 declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1 declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1 declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1 @@ -1891,12 +2689,22 @@ declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1 declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1 declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1 +declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) #1 +declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) #1 +declare <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16>, i32) #1 +declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) #1 +declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) #1 +declare <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16>, <8 x i16>) #1 +declare <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64>, i32) #1 declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) #1 declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) #1 +declare <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64>, <2 x i64>) #1 declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) #1 declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) #1 +declare <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64>, i32) #1 declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) #1 declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) #1 +declare <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64>, <2 x i64>) #1 declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) #1 declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) #1 -- 2.7.4