From 8247687e0fdde7de092b4820ebfcfd9f2c68bd79 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 9 Jul 2017 15:47:43 +0000 Subject: [PATCH] [X86][AVX512] Regenerate AVX512VL comparison tests. Show poor codegen on KNL targets as mentioned on D35179 llvm-svn: 307500 --- llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll | 925 +- llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll | 50914 +++++++++++++++++++-- 2 files changed, 47149 insertions(+), 4690 deletions(-) diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll index e0acf2b..43b1f53 100644 --- a/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll @@ -1,56 +1,98 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=VLX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=NoVLX define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind { -; CHECK-LABEL: test256_1: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_1: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_1: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2 +; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: retq %mask = icmp eq <4 x i64> %x, %y %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y ret <4 x i64> %max } define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind { -; CHECK-LABEL: test256_2: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_2: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 +; VLX-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_2: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 +; NoVLX-NEXT: retq %mask = icmp sgt <4 x i64> %x, %y %max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y ret <4 x i64> %max } define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind { -; CHECK-LABEL: test256_3: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_3: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k1 +; VLX-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_3: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM2 %YMM2 %ZMM2 +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k1 +; NoVLX-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %mask = icmp sge <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y ret <8 x i32> %max } define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind { -; CHECK-LABEL: test256_4: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_4: +; VLX: # BB#0: +; VLX-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1 +; 
VLX-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_4: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm3, %ymm1, %ymm4 +; NoVLX-NEXT: vpxor %ymm3, %ymm0, %ymm0 +; NoVLX-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm0 +; NoVLX-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 +; NoVLX-NEXT: retq %mask = icmp ugt <4 x i64> %x, %y %max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y ret <4 x i64> %max } define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind { -; CHECK-LABEL: test256_5: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_5: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_5: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp eq <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -58,11 +100,21 @@ define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwin } define <8 x i32> @test256_5b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind { -; CHECK-LABEL: test256_5b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_5b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_5b: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpeqd %zmm0, %zmm2, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp eq <8 x i32> %y, %x %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -70,11 +122,21 @@ define <8 x i32> @test256_5b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi } define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test256_6: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_6: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_6: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpgtd %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp sgt <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -82,11 +144,21 @@ define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun } define <8 x i32> @test256_6b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test256_6b: -; CHECK: ## 
BB#0: -; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_6b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_6b: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpgtd %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp slt <8 x i32> %y, %x %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -94,11 +166,21 @@ define <8 x i32> @test256_6b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou } define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test256_7: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_7: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_7: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpled %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp sle <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -106,11 +188,21 @@ define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun } define <8 x i32> @test256_7b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test256_7b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_7b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_7b: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpled %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp sge <8 x i32> %y, %x %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -118,11 +210,21 @@ define <8 x i32> @test256_7b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou } define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test256_8: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_8: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_8: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpleud %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp ule <8 x 
i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -130,11 +232,21 @@ define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun } define <8 x i32> @test256_8b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test256_8b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_8b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_8b: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpnltud %zmm0, %zmm2, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp uge <8 x i32> %y, %x %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -142,12 +254,25 @@ define <8 x i32> @test256_8b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou } define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> %y1) nounwind { -; CHECK-LABEL: test256_9: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1} -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_9: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; VLX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1} +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_9: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM3 %YMM3 %ZMM3 +; NoVLX-NEXT: # kill: %YMM2 %YMM2 %ZMM2 +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpeqd %zmm3, %zmm2, %k0 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %mask1 = icmp eq <8 x i32> %x1, %y1 %mask0 = icmp eq <8 x i32> %x, %y %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer @@ -156,12 +281,22 @@ define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> } define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) nounwind { -; CHECK-LABEL: test256_10: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1} -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_10: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleq %ymm1, %ymm0, %k1 +; VLX-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1} +; VLX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_10: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm3 +; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 +; NoVLX-NEXT: vpandn %ymm3, %ymm1, %ymm1 +; NoVLX-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: retq %mask1 = icmp sge <4 x i64> %x1, %y1 %mask0 = icmp sle <4 x i64> %x, %y %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer @@ -170,12 +305,20 @@ define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64 } define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind { -; CHECK-LABEL: test256_11: 
-; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_11: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtq %ymm2, %ymm1, %k1 +; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1} +; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_11: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm3 +; NoVLX-NEXT: vpand %ymm2, %ymm3, %ymm2 +; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: retq %mask1 = icmp sgt <4 x i64> %x1, %y1 %y = load <4 x i64>, <4 x i64>* %y.ptr, align 4 %mask0 = icmp sgt <4 x i64> %x, %y @@ -185,12 +328,25 @@ define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 } define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind { -; CHECK-LABEL: test256_12: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1 -; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_12: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled %ymm1, %ymm2, %k1 +; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1} +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_12: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM2 %YMM2 %ZMM2 +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpled %zmm1, %zmm2, %k0 +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpleud %zmm2, %zmm0, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %mask1 = icmp sge <8 x i32> %x1, %y1 %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask0 = icmp ule <8 x i32> %x, %y @@ -200,11 +356,18 @@ define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 } define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind { -; CHECK-LABEL: test256_13: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_13: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1 +; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_13: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm2 +; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: retq %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0 %y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer @@ -214,11 +377,21 @@ define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind } define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind { -; CHECK-LABEL: test256_14: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_14: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_14: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpled %zmm2, %zmm0, %k1 +; NoVLX-NEXT: 
vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0 %y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer @@ -228,12 +401,25 @@ define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind } define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind { -; CHECK-LABEL: test256_15: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1 -; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_15: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled %ymm1, %ymm2, %k1 +; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1} +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_15: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM2 %YMM2 %ZMM2 +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpled %zmm1, %zmm2, %k0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpgtd %zmm2, %zmm0, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %mask1 = icmp sge <8 x i32> %x1, %y1 %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0 @@ -245,12 +431,21 @@ define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32 } define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind { -; CHECK-LABEL: test256_16: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleq %ymm1, %ymm2, %k1 -; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_16: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleq %ymm1, %ymm2, %k1 +; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1} +; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_16: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm3 +; NoVLX-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm3 +; NoVLX-NEXT: vpandn %ymm3, %ymm2, %ymm2 +; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: retq %mask1 = icmp sge <4 x i64> %x1, %y1 %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0 @@ -262,11 +457,21 @@ define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64 } define <8 x i32> @test256_17(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind { -; CHECK-LABEL: test256_17: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_17: +; VLX: # BB#0: +; VLX-NEXT: vpcmpneqd (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_17: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpneqd %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp ne <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -274,11 +479,21 @@ define <8 x i32> @test256_17(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* 
%yp) nounwi } define <8 x i32> @test256_18(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind { -; CHECK-LABEL: test256_18: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_18: +; VLX: # BB#0: +; VLX-NEXT: vpcmpneqd (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_18: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpneqd %zmm0, %zmm2, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp ne <8 x i32> %y, %x %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -286,11 +501,21 @@ define <8 x i32> @test256_18(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi } define <8 x i32> @test256_19(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind { -; CHECK-LABEL: test256_19: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpnltud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_19: +; VLX: # BB#0: +; VLX-NEXT: vpcmpnltud (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_19: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpnltud %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp uge <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -298,11 +523,21 @@ define <8 x i32> @test256_19(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi } define <8 x i32> @test256_20(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind { -; CHECK-LABEL: test256_20: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_20: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_20: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpnltud %zmm0, %zmm2, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp uge <8 x i32> %y, %x %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -310,55 +545,90 @@ define <8 x i32> @test256_20(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi } define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind { -; CHECK-LABEL: test128_1: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_1: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 +; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_1: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2 +; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask = icmp eq <2 x i64> %x, %y %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y ret <2 x i64> %max } 
define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind { -; CHECK-LABEL: test128_2: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_2: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 +; VLX-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_2: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask = icmp sgt <2 x i64> %x, %y %max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y ret <2 x i64> %max } define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind { -; CHECK-LABEL: test128_3: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_3: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k1 +; VLX-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_3: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; NoVLX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask = icmp sge <4 x i32> %x, %y %max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y ret <4 x i32> %max } define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind { -; CHECK-LABEL: test128_4: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_4: +; VLX: # BB#0: +; VLX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1 +; VLX-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_4: +; NoVLX: # BB#0: +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm3, %xmm1, %xmm4 +; NoVLX-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; NoVLX-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm0 +; NoVLX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask = icmp ugt <2 x i64> %x, %y %max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y ret <2 x i64> %max } define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind { -; CHECK-LABEL: test128_5: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_5: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_5: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %yp, align 4 %mask = icmp eq <4 x i32> %x, %y %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -366,11 +636,17 @@ define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwin } define <4 x i32> @test128_5b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind { -; CHECK-LABEL: test128_5b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_5b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_5b: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: 
vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %yp, align 4 %mask = icmp eq <4 x i32> %y, %x %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -378,11 +654,17 @@ define <4 x i32> @test128_5b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwi } define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_6: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_6: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_6: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp sgt <4 x i32> %x, %y %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -390,11 +672,17 @@ define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun } define <4 x i32> @test128_6b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_6b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_6b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_6b: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp slt <4 x i32> %y, %x %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -402,11 +690,19 @@ define <4 x i32> @test128_6b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou } define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_7: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_7: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_7: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp sle <4 x i32> %x, %y %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -414,11 +710,19 @@ define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun } define <4 x i32> @test128_7b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_7b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_7b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_7b: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp sge <4 x i32> %y, %x %max = select <4 
x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -426,11 +730,18 @@ define <4 x i32> @test128_7b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou } define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_8: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_8: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_8: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpminud (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp ule <4 x i32> %x, %y %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -438,11 +749,19 @@ define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun } define <4 x i32> @test128_8b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_8b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_8b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_8b: +; NoVLX: # BB#0: +; NoVLX-NEXT: vmovdqu (%rdi), %xmm2 +; NoVLX-NEXT: vpmaxud %xmm0, %xmm2, %xmm3 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp uge <4 x i32> %y, %x %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -450,12 +769,20 @@ define <4 x i32> @test128_8b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou } define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> %y1) nounwind { -; CHECK-LABEL: test128_9: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1} -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_9: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; VLX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1} +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_9: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm3 +; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask1 = icmp eq <4 x i32> %x1, %y1 %mask0 = icmp eq <4 x i32> %x, %y %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer @@ -464,12 +791,22 @@ define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> } define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) nounwind { -; CHECK-LABEL: test128_10: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1} -; CHECK-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_10: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleq %xmm1, %xmm0, %k1 +; VLX-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1} +; VLX-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_10: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm3 +; NoVLX-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; NoVLX-NEXT: vpxor %xmm4, 
%xmm3, %xmm3 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; NoVLX-NEXT: vpandn %xmm3, %xmm1, %xmm1 +; NoVLX-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: retq %mask1 = icmp sge <2 x i64> %x1, %y1 %mask0 = icmp sle <2 x i64> %x, %y %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer @@ -478,12 +815,20 @@ define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64 } define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind { -; CHECK-LABEL: test128_11: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtq %xmm2, %xmm1, %k1 -; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_11: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtq %xmm2, %xmm1, %k1 +; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1} +; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_11: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 +; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm3 +; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2 +; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask1 = icmp sgt <2 x i64> %x1, %y1 %y = load <2 x i64>, <2 x i64>* %y.ptr, align 4 %mask0 = icmp sgt <2 x i64> %x, %y @@ -493,12 +838,21 @@ define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 } define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind { -; CHECK-LABEL: test128_12: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1 -; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_12: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled %xmm1, %xmm2, %k1 +; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1} +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_12: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2 +; NoVLX-NEXT: vpminud (%rdi), %xmm0, %xmm3 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3 +; NoVLX-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask1 = icmp sge <4 x i32> %x1, %y1 %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask0 = icmp ule <4 x i32> %x, %y @@ -508,11 +862,18 @@ define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 } define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind { -; CHECK-LABEL: test128_13: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_13: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1 +; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_13: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm2 +; NoVLX-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm2 +; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0 %y = insertelement <2 x i64> %y.0, i64 %yb, i32 1 @@ -522,11 +883,20 @@ define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind } define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind { -; CHECK-LABEL: test128_14: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; 
VLX-LABEL: test128_14: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_14: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0 %y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer @@ -536,12 +906,21 @@ define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind } define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind { -; CHECK-LABEL: test128_15: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1 -; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_15: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled %xmm1, %xmm2, %k1 +; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1} +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_15: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2 +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm3 +; NoVLX-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm3 +; NoVLX-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask1 = icmp sge <4 x i32> %x1, %y1 %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0 @@ -553,12 +932,21 @@ define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32 } define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind { -; CHECK-LABEL: test128_16: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleq %xmm1, %xmm2, %k1 -; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_16: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleq %xmm1, %xmm2, %k1 +; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1} +; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_16: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2 +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm3 +; NoVLX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 +; NoVLX-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask1 = icmp sge <2 x i64> %x1, %y1 %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0 @@ -570,11 +958,19 @@ define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64 } define <4 x i32> @test128_17(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_17: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_17: +; VLX: # BB#0: +; VLX-NEXT: vpcmpneqd (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_17: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp ne <4 x i32> %x, %y %max = select <4 x 
i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -582,11 +978,19 @@ define <4 x i32> @test128_17(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou } define <4 x i32> @test128_18(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_18: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_18: +; VLX: # BB#0: +; VLX-NEXT: vpcmpneqd (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_18: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp ne <4 x i32> %y, %x %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -594,11 +998,18 @@ define <4 x i32> @test128_18(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou } define <4 x i32> @test128_19(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_19: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpnltud (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_19: +; VLX: # BB#0: +; VLX-NEXT: vpcmpnltud (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_19: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpmaxud (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp uge <4 x i32> %x, %y %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -606,11 +1017,19 @@ define <4 x i32> @test128_19(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou } define <4 x i32> @test128_20(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_20: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_20: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_20: +; NoVLX: # BB#0: +; NoVLX-NEXT: vmovdqu (%rdi), %xmm2 +; NoVLX-NEXT: vpmaxud %xmm0, %xmm2, %xmm3 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp uge <4 x i32> %y, %x %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll index f297fc3..4d3a149 100644 --- a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -1,13 +1,124 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=NoVLX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=VLX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=NoVLX define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: 
test_vpcmpeqb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi0: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi2: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi3: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi4: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi5: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi6: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi7: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -18,11 +129,122 @@ entry: } define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqb (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqb (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi8: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi9: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi10: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi11: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi12: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi13: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi14: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi15: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, 
%k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -34,12 +256,124 @@ entry: } define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi16: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi17: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi18: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi19: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi20: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi21: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi22: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi23: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -52,12 +386,124 @@ entry: } define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi24: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi25: +; NoVLX-NEXT: 
.cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi26: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi27: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi28: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi29: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi30: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi31: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: 
leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -72,11 +518,127 @@ entry: define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi32: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi33: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi34: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi35: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi36: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi37: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi38: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi39: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -87,11 +649,127 @@ entry: } define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqb (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqb (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi40: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi41: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi42: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi43: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi44: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi45: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi46: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi47: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -103,12 +781,129 @@ entry: } define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi48: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi49: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi50: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq 
$-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi51: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi52: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi53: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi54: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi55: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; 
NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -121,12 +916,129 @@ entry: } define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi56: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi57: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi58: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi59: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi60: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi61: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi62: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi63: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; 
NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -141,12 +1053,46 @@ entry: define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqb_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi64: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi65: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi66: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %1 = bitcast <4 x i64> %__b to <32 x i8> @@ -157,12 +1103,46 @@ entry: } define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqb (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, 
%rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqb (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi67: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi68: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi69: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqb (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %load = load <4 x i64>, <4 x i64>* %__b @@ -174,13 +1154,56 @@ entry: } define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi70: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi71: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi72: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 +; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl 
{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %1 = bitcast <4 x i64> %__b to <32 x i8> @@ -193,13 +1216,56 @@ entry: } define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqb (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqb (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi73: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi74: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi75: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; NoVLX-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %load = load <4 x i64>, <4 x i64>* %__b @@ -214,11 +1280,24 @@ entry: define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ 
-229,11 +1308,24 @@ entry: } define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -245,12 +1337,26 @@ entry: } define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -263,12 +1369,26 @@ entry: } define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -283,11 +1403,72 @@ entry: define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 -; 
CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi76: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi77: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi78: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -298,11 +1479,72 @@ entry: } define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi79: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi80: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi81: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, 
%zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -314,12 +1556,74 @@ entry: } define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi82: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi83: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi84: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: 
kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -332,12 +1636,74 @@ entry: } define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi85: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi86: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi87: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, 
%xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -352,11 +1718,77 @@ entry: define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi88: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi89: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi90: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -367,11 +1799,77 @@ entry: } define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: 
test_vpcmpeqw_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi91: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi92: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi93: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -383,12 +1881,79 @@ entry: } define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi94: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; 
NoVLX-NEXT: .Lcfi95: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi96: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -401,12 +1966,79 @@ entry: } define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi97: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi98: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi99: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord 
%zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -421,12 +2053,123 @@ entry: define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi100: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi101: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi102: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi103: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi104: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi105: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi106: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi107: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: 
kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -437,12 +2180,123 @@ entry: } define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi108: +; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi109: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi110: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi111: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi112: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi113: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi114: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi115: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: 
movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -454,13 +2308,125 @@ entry: } define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi116: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi117: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi118: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi119: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi120: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi121: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi122: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi123: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -473,13 +2439,125 @@ entry: } define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi124: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi125: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi126: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi127: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi128: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi129: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi130: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi131: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -494,12 +2572,128 @@ entry: define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi132: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi133: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi134: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx 
+; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi135: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi136: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi137: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi138: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi139: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq 
%r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -510,12 +2704,128 @@ entry: } define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi140: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi141: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi142: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi143: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi144: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi145: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi146: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi147: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -527,13 +2837,130 @@ entry: } define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi148: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi149: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi150: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi151: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi152: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi153: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi154: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi155: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -546,13 +2973,130 @@ entry: } define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi156: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi157: +; NoVLX-NEXT: 
.cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi158: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi159: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi160: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi161: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi162: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi163: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, 
%zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -567,12 +3111,348 @@ entry: define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi164: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi165: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi166: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: 
shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: 
vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -583,12 +3463,263 @@ entry: } define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem: +; NoVLX: # 
BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi167: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi168: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi169: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpeqw 32(%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: 
vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: 
vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -600,13 +3731,358 @@ entry: } define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi170: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi171: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi172: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: shrl 
$16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 
+; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} +; NoVLX-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm6, %xmm6 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm8, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, 
%xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; 
NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -619,13 +4095,273 @@ entry: } define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi173: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi174: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi175: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx 
+; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm2 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: 
kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpcmpeqw 32(%rsi), %ymm4, %ymm4 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4 +; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4 +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -640,11 +4376,51 @@ entry: define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q 
%zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -655,11 +4431,51 @@ entry: } define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -671,12 +4487,70 @@ entry: } define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: 
kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -690,12 +4564,70 @@ entry: } define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; 
NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -711,11 +4643,52 @@ entry: define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: 
kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -728,12 +4701,71 @@ entry: } define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -750,11 +4782,50 @@ entry: define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: 
test_vpcmpeqd_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -765,11 +4836,50 @@ entry: } define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, 
%zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -781,12 +4891,69 @@ entry: } define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX 
+; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -800,12 +4967,69 @@ entry: } define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -821,11 +5045,51 @@ entry: define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; 
VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -838,12 +5102,70 @@ entry: } define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, 
%eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -860,11 +5182,39 @@ entry: define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi176: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi177: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi178: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -875,11 +5225,39 @@ entry: } define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; 
VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi179: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi180: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi181: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -891,12 +5269,58 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi182: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi183: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi184: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: 
%0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -910,12 +5334,58 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi185: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi186: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi187: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -931,11 +5401,40 @@ entry: define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi188: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi189: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi190: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -948,12 +5447,59 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi191: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi192: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi193: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -970,11 +5516,46 @@ entry: define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask: +; VLX: # 
BB#0: # %entry +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi194: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi195: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi196: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -985,11 +5566,46 @@ entry: } define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi197: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi198: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi199: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -1001,12 +5617,65 @@ entry: } define zeroext i64 
@test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi200: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi201: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi202: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -1020,12 +5689,65 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi203: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi204: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi205: +; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -1041,11 +5763,47 @@ entry: define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi206: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi207: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi208: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl 
{{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -1058,12 +5816,66 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi209: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi210: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi211: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -1080,21 +5892,23 @@ entry: define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: 
vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -1106,21 +5920,23 @@ entry: } define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -1133,23 +5949,25 @@ entry: } define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -1163,23 +5981,25 @@ entry: } define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry 
-; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -1195,21 +6015,23 @@ entry: define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -1223,23 +6045,25 @@ entry: } define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -1256,12 +6080,72 @@ entry: define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi212: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi213: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi214: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; 
NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -1272,12 +6156,72 @@ entry: } define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi215: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi216: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi217: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -1289,13 +6233,75 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi218: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi219: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi220: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -1308,13 +6314,75 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi221: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi222: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi223: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -1329,12 +6397,72 @@ entry: define zeroext i32 
@test_vpcmpeqd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi224: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi225: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi226: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -1347,13 +6475,75 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi227: +; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi228: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi229: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -1369,12 +6559,77 @@ entry: define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi230: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi231: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi232: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -1385,12 +6640,77 @@ entry: } define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi233: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi234: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi235: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -1402,13 +6722,80 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi236: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi237: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi238: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -1421,13 +6808,80 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi239: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi240: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi241: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, 
%rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -1442,12 +6896,77 @@ entry: define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi242: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi243: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi244: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -1460,13 +6979,80 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq 
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi245: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi246: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi247: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -1482,12 +7068,120 @@ entry: define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi248: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi249: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi250: +; 
NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi251: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi252: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi253: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi254: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi255: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x 
i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -1498,12 +7192,120 @@ entry: } define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi256: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi257: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi258: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi259: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi260: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi261: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi262: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi263: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -1515,13 +7317,122 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi264: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi265: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi266: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi267: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi268: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi269: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi270: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi271: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, 
%esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -1534,13 +7445,122 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi272: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi273: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi274: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi275: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi276: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi277: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi278: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi279: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: 
vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -1555,12 +7575,120 @@ entry: define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: 
test_vpcmpeqd_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi280: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi281: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi282: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi283: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi284: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi285: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi286: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi287: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) 
+; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -1573,13 +7701,122 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi288: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi289: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi290: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi291: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi292: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi293: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi294: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi295: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, 
%r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -1595,12 +7832,125 @@ entry: define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi296: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi297: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi298: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi299: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi300: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi301: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi302: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi303: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -1611,12 +7961,125 @@ entry: } define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi304: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi305: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi306: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi307: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi308: +; NoVLX-NEXT: 
.cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi309: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi310: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi311: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -1628,13 +8091,127 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 
zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi312: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi313: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi314: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi315: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi316: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi317: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi318: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi319: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, 
%r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -1647,13 +8224,127 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi320: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi321: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi322: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi323: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi324: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi325: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi326: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi327: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -1668,12 +8359,125 @@ entry: define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi328: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi329: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi330: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi331: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi332: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi333: +; NoVLX-NEXT: .cfi_offset 
%r13, -40 +; NoVLX-NEXT: .Lcfi334: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi335: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -1686,13 +8490,127 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { -; 
CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi336: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi337: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi338: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi339: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi340: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi341: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi342: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi343: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -1708,12 +8626,23 @@ entry: define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -1724,12 +8653,23 @@ entry: } define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -1741,13 +8681,34 @@ entry: } define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, 
-{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -1761,13 +8722,34 @@ entry: } define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -1783,12 +8765,24 @@ entry: define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: 
vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -1801,13 +8795,35 @@ entry: } define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -1824,11 +8840,35 @@ entry: define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -1839,11 +8879,35 @@ entry: } define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; 
CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -1855,12 +8919,46 @@ entry: } define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -1874,12 +8972,46 @@ entry: } define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; 
CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -1895,11 +9027,36 @@ entry: define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -1912,12 +9069,47 @@ entry: } define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; 
CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -1934,11 +9126,34 @@ entry: define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -1949,11 +9164,34 @@ entry: } define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX 
%EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -1965,12 +9203,45 @@ entry: } define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -1984,12 +9255,45 @@ entry: } define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: 
test_masked_vpcmpeqq_v2i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -2005,11 +9309,35 @@ entry: define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -2022,12 +9350,46 @@ entry: } define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), 
%xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -2044,11 +9406,39 @@ entry: define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi344: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi345: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi346: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -2059,11 +9449,39 @@ entry: } define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi347: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi348: +; 
NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi349: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -2075,12 +9493,50 @@ entry: } define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi350: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi351: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi352: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -2094,12 +9550,50 @@ entry: } define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: 
test_masked_vpcmpeqq_v2i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi353: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi354: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi355: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -2115,11 +9609,40 @@ entry: define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi356: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi357: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi358: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -2132,12 +9655,51 @@ entry: } define zeroext i32 
@test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi359: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi360: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi361: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -2154,11 +9716,46 @@ entry: define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi362: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi363: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi364: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd 
%zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -2169,11 +9766,46 @@ entry: } define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi365: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi366: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi367: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -2185,12 +9817,57 @@ entry: } define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi368: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi369: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi370: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -2204,12 +9881,57 @@ entry: } define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi371: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi372: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi373: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; 
NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -2225,11 +9947,47 @@ entry: define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi374: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi375: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi376: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -2242,12 +10000,58 @@ entry: } define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi377: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi378: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi379: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: 
vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -2264,12 +10068,53 @@ entry: define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -2280,12 +10125,53 @@ entry: } define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: 
test_vpcmpeqq_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -2297,13 +10183,72 @@ entry: } define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw 
%k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -2317,13 +10262,72 @@ entry: } define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -2339,12 +10343,54 @@ entry: define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = 
bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -2357,13 +10403,73 @@ entry: } define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -2380,12 +10486,52 @@ entry: define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; 
VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -2396,12 +10542,52 @@ entry: } define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} 
{z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -2413,13 +10599,71 @@ entry: } define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 
{{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -2433,13 +10677,71 @@ entry: } define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -2455,12 +10757,53 @@ entry: define zeroext i16 
@test_vpcmpeqq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -2473,13 +10816,72 @@ entry: } define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw 
$15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -2496,12 +10898,41 @@ entry: define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi380: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi381: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi382: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, 
%rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -2512,12 +10943,41 @@ entry: } define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi383: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi384: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi385: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -2529,13 +10989,60 @@ entry: } define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi386: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi387: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi388: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; 
NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -2549,13 +11056,60 @@ entry: } define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi389: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi390: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi391: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -2571,12 +11125,42 @@ entry: define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; 
CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi392: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi393: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi394: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -2589,13 +11173,61 @@ entry: } define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi395: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi396: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi397: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -2612,12 +11244,48 @@ entry: define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi398: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi399: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi400: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -2628,12 +11296,48 @@ entry: } define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi401: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi402: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi403: +; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -2645,13 +11349,67 @@ entry: } define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi404: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi405: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi406: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: 
vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -2665,13 +11423,67 @@ entry: } define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi407: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi408: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi409: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -2687,12 +11499,49 @@ entry: define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; 
CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi410: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi411: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi412: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -2705,13 +11554,68 @@ entry: } define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi413: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi414: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi415: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, 
%xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -2728,12 +11632,20 @@ entry: define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -2744,12 +11656,20 @@ entry: } define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -2761,13 +11681,22 @@ entry: } define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: 
vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -2780,13 +11709,22 @@ entry: } define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -2801,12 +11739,20 @@ entry: define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -2819,13 +11765,22 @@ entry: } define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -2841,12 +11796,70 @@ entry: define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 
+; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi416: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi417: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi418: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -2857,12 +11870,70 @@ entry: } define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi419: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi420: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi421: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; 
NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -2874,13 +11945,72 @@ entry: } define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi422: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi423: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi424: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -2893,13 +12023,72 @@ entry: } define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi425: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi426: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi427: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -2914,12 
+12103,70 @@ entry: define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi428: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi429: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi430: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -2932,13 +12179,72 @@ entry: } define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi431: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi432: +; 
NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi433: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -2954,12 +12260,75 @@ entry: define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi434: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi435: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi436: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -2970,12 +12339,75 @@ entry: } define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi437: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi438: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi439: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -2987,13 +12419,77 @@ entry: } define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi440: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi441: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi442: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; 
NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -3006,13 +12502,77 @@ entry: } define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi443: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi444: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi445: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -3027,12 +12587,75 @@ entry: define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: 
test_vpcmpeqq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi446: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi447: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi448: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -3045,13 +12668,77 @@ entry: } define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi449: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi450: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi451: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, 
%rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -3067,11 +12754,122 @@ entry: define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi452: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi453: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi454: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi455: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi456: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi457: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi458: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi459: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; 
NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -3082,11 +12880,122 @@ entry: } define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtb (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: 
pushq %rbp +; NoVLX-NEXT: .Lcfi460: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi461: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi462: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi463: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi464: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi465: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi466: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi467: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 
+; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -3098,12 +13007,124 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi468: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi469: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi470: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi471: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi472: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi473: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi474: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi475: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -3116,12 +13137,124 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi476: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi477: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi478: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi479: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi480: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi481: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi482: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi483: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: 
kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -3136,11 +13269,127 @@ entry: define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi484: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi485: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi486: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: 
.Lcfi487: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi488: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi489: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi490: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi491: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp 
+; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -3151,11 +13400,127 @@ entry: } define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtb (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi492: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi493: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi494: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi495: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi496: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi497: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi498: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi499: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -3167,12 +13532,129 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi500: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi501: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi502: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi503: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi504: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi505: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi506: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi507: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -3185,12 +13667,129 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi508: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi509: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi510: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq 
%r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi511: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi512: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi513: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi514: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi515: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; 
NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -3205,12 +13804,46 @@ entry: define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi516: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi517: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi518: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %1 = bitcast <4 x i64> %__b to <32 x i8> @@ -3221,12 +13854,46 @@ entry: } define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtb (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtb (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi519: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi520: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi521: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtb (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw 
%k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %load = load <4 x i64>, <4 x i64>* %__b @@ -3238,13 +13905,56 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi522: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi523: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi524: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 +; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; NoVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %1 = bitcast <4 x i64> %__b to <32 x i8> @@ -3257,13 +13967,56 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtb (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtb (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi525: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi526: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi527: +; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; NoVLX-NEXT: vpcmpgtb (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %load = load <4 x i64>, <4 x i64>* %__b @@ -3278,11 +14031,24 @@ entry: define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -3293,11 +14059,24 @@ entry: } define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -3309,12 +14088,26 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: 
test_masked_vpcmpsgtw_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -3327,12 +14120,26 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -3347,11 +14154,72 @@ entry: define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi528: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi529: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi530: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -3362,11 +14230,72 @@ entry: } define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi531: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi532: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi533: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, 
%xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -3378,12 +14307,74 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi534: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi535: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi536: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -3396,12 +14387,74 @@ entry: } define zeroext i32 
@test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi537: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi538: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi539: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -3416,11 +14469,77 @@ entry: define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi540: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi541: +; NoVLX-NEXT: 
.cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi542: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -3431,11 +14550,77 @@ entry: } define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi543: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi544: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi545: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, 
%k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -3447,12 +14632,79 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi546: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi547: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi548: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -3465,12 +14717,79 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi549: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi550: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi551: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -3485,12 +14804,123 @@ entry: define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi552: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi553: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi554: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi555: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi556: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi557: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi558: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi559: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -3501,12 +14931,123 @@ entry: } define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi560: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi561: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi562: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi563: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi564: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi565: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi566: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi567: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; 
NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -3518,13 +15059,125 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi568: +; 
NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi569: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi570: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi571: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi572: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi573: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi574: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi575: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; 
NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -3537,13 +15190,125 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi576: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi577: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi578: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi579: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi580: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi581: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi582: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi583: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw 
%k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -3558,12 +15323,128 @@ entry: define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi584: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi585: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi586: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi587: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi588: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi589: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi590: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi591: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d 
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -3574,12 +15455,128 @@ entry: } define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi592: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi593: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi594: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: 
pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi595: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi596: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi597: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi598: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi599: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq 
%rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -3591,13 +15588,130 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi600: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi601: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi602: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi603: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi604: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi605: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi606: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi607: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -3610,13 +15724,130 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi608: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi609: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi610: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi611: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi612: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi613: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi614: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi615: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: 
kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -3631,12 +15862,348 @@ entry: define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask: +; 
NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi616: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi617: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi618: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, 
%xmm2, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd 
%ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -3647,12 +16214,263 @@ entry: } define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi619: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi620: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi621: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, 
%rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw 32(%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $9, 
%k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw 
%k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -3664,13 +16482,358 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi622: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi623: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi624: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: 
shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd 
%ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm6, %xmm6 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm8, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; 
NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, 
%zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -3683,13 +16846,273 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi625: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi626: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi627: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw 
$1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm2 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, 
%eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpcmpgtw 32(%rsi), %ymm4, %ymm4 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4 +; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4 +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -3704,11 +17127,51 @@ entry: define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -3719,11 +17182,51 @@ entry: } define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, 
%zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -3735,12 +17238,70 @@ entry: } define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, 
%k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -3754,12 +17315,70 @@ entry: } define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; 
NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -3775,11 +17394,52 @@ entry: define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -3792,12 +17452,71 @@ entry: } define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: 
kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -3814,11 +17533,50 @@ entry: define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: 
vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -3829,11 +17587,50 @@ entry: } define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -3845,12 +17642,69 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; 
VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -3864,12 +17718,69 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, 
%k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -3885,11 +17796,51 @@ entry: define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 
{%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -3902,12 +17853,70 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; 
NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -3924,11 +17933,39 @@ entry: define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi628: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi629: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi630: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -3939,11 +17976,39 @@ entry: } define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi631: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi632: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi633: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; 
NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -3955,12 +18020,58 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi634: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi635: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi636: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -3974,12 +18085,58 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi637: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi638: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi639: 
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -3995,11 +18152,40 @@ entry: define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi640: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi641: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi642: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -4012,12 +18198,59 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: 
kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi643: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi644: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi645: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -4034,11 +18267,46 @@ entry: define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi646: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi647: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi648: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: 
vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -4049,11 +18317,46 @@ entry: } define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi649: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi650: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi651: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -4065,12 +18368,65 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi652: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi653: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi654: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, 
%xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -4084,12 +18440,65 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi655: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi656: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi657: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: 
kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -4105,11 +18514,47 @@ entry: define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi658: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi659: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi660: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -4122,12 +18567,66 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # 
%entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi661: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi662: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi663: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -4144,21 +18643,23 @@ entry: define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -4170,21 +18671,23 @@ entry: } define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem: +; VLX: 
# BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -4197,23 +18700,25 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -4227,23 +18732,25 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -4259,21 +18766,23 @@ entry: define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; 
VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -4287,23 +18796,25 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -4320,28 +18831,148 @@ entry: define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -entry: - %0 = bitcast <4 x i64> %__a to <8 x i32> - %1 = bitcast <4 x i64> %__b to <8 x i32> - %2 = icmp sgt <8 x i32> %0, %1 +; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi664: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi665: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi666: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> %4 = bitcast <32 x i1> %3 to i32 ret i32 %4 } define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi667: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi668: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi669: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: 
vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -4353,13 +18984,75 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi670: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi671: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi672: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -4372,13 +19065,75 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* 
%__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi673: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi674: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi675: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -4393,12 +19148,72 @@ entry: define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi676: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: 
.Lcfi677: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi678: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -4411,13 +19226,75 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi679: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi680: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi681: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: 
kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -4433,12 +19310,77 @@ entry: define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi682: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi683: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi684: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -4449,12 +19391,77 @@ entry: } define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi685: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi686: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi687: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) 
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -4466,13 +19473,80 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi688: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi689: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi690: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -4485,13 +19559,80 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, 
<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi691: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi692: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi693: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -4506,12 +19647,77 @@ entry: define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: 
vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi694: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi695: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi696: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -4524,13 +19730,80 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi697: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi698: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi699: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 
%ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -4546,12 +19819,120 @@ entry: define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi700: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi701: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi702: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi703: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi704: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi705: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi706: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi707: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; 
NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -4562,12 +19943,120 @@ entry: } define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; 
NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi708: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi709: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi710: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi711: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi712: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi713: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi714: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi715: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; 
NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -4579,13 +20068,122 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi716: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi717: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi718: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi719: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi720: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi721: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi722: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi723: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -4598,13 +20196,122 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi724: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi725: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi726: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi727: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi728: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi729: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi730: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi731: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -4619,12 +20326,120 @@ entry: define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi732: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi733: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi734: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi735: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi736: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi737: +; NoVLX-NEXT: .cfi_offset %r13, 
-40 +; NoVLX-NEXT: .Lcfi738: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi739: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -4637,13 +20452,122 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, 
%eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi740: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi741: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi742: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi743: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi744: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi745: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi746: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi747: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; 
NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -4659,12 +20583,125 @@ entry: define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi748: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi749: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi750: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi751: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi752: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi753: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi754: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi755: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw 
%k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -4675,12 +20712,125 @@ entry: } define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi756: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi757: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi758: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi759: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi760: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi761: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi762: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi763: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, 
%r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -4692,13 +20842,127 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi764: +; 
NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi765: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi766: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi767: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi768: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi769: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi770: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi771: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; 
NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -4711,13 +20975,127 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi772: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi773: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi774: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi775: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi776: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi777: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi778: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi779: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -4732,12 +21110,125 @@ entry: define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi780: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi781: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi782: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi783: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi784: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi785: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi786: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi787: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -4750,13 +21241,127 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi788: +; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi789: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi790: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi791: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi792: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi793: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi794: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi795: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; 
NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -4772,12 +21377,23 @@ entry: define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -4788,12 +21404,23 @@ entry: } define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -4805,13 +21432,34 @@ entry: } define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: 
kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -4825,13 +21473,34 @@ entry: } define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -4847,12 +21516,24 @@ entry: define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -4865,13 +21546,35 @@ entry: } define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd 
%edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -4888,11 +21591,35 @@ entry: define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -4903,11 +21630,35 @@ entry: } define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, 
%zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -4919,12 +21670,46 @@ entry: } define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -4938,12 +21723,46 @@ entry: } define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw 
$14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -4959,11 +21778,36 @@ entry: define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -4976,12 +21820,47 @@ entry: } define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; 
NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -4998,11 +21877,34 @@ entry: define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -5013,11 +21915,34 @@ entry: } define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 
{%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -5029,12 +21954,45 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -5048,12 +22006,45 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, 
%zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -5069,11 +22060,35 @@ entry: define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -5086,12 +22101,46 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, 
%xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -5108,11 +22157,39 @@ entry: define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi796: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi797: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi798: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -5123,11 +22200,39 @@ entry: } define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi799: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi800: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi801: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; 
NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -5139,12 +22244,50 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi802: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi803: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi804: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -5158,12 +22301,50 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi805: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi806: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi807: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, 
%rsp +; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -5179,11 +22360,40 @@ entry: define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi808: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi809: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi810: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -5196,12 +22406,51 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b: 
+; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi811: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi812: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi813: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -5218,11 +22467,46 @@ entry: define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi814: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi815: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi816: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -5233,11 +22517,46 @@ entry: } define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x 
i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi817: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi818: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi819: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -5249,12 +22568,57 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi820: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi821: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi822: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -5268,12 +22632,57 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi823: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi824: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi825: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -5289,11 +22698,47 @@ entry: define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; 
CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi826: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi827: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi828: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -5306,12 +22751,58 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi829: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi830: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi831: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -5328,12 +22819,53 @@ entry: define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -5344,12 +22876,53 @@ entry: } define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; 
NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -5361,13 +22934,72 @@ entry: } define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 
+; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -5381,13 +23013,72 @@ entry: } define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq 
$255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -5403,12 +23094,54 @@ entry: define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -5421,13 +23154,73 @@ entry: } define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq 
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -5444,12 +23237,52 @@ entry: define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, 
%zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX %AX %EAX
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5460,12 +23293,52 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX %AX %EAX
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX %AX %EAX
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load <4 x i64>, <4 x i64>* %__b
@@ -5477,13 +23350,71 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX %AX %EAX
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX %AX %EAX
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5497,13 +23428,71 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX %AX %EAX
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX %AX %EAX
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load <4 x i64>, <4 x i64>* %__b
@@ -5519,12 +23508,53 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX %AX %EAX
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX %AX %EAX
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load i64, i64* %__b
@@ -5537,13 +23567,72 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX %AX %EAX
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX %AX %EAX
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load i64, i64* %__b
@@ -5560,12 +23649,41 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi832:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi833:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi834:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5576,12 +23694,41 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi835:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi836:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi837:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load <4 x i64>, <4 x i64>* %__b
@@ -5593,13 +23740,60 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi838:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi839:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi840:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5613,13 +23807,60 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi841:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi842:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi843:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load <4 x i64>, <4 x i64>* %__b
@@ -5635,12 +23876,42 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi844:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi845:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi846:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load i64, i64* %__b
@@ -5653,13 +23924,61 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi847:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi848:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi849:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load i64, i64* %__b
@@ -5676,12 +23995,48 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi850:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi851:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi852:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5692,12 +24047,48 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi853:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi854:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi855:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load <4 x i64>, <4 x i64>* %__b
@@ -5709,13 +24100,67 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi856:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi857:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi858:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5729,13 +24174,67 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi859:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi860:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi861:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load <4 x i64>, <4 x i64>* %__b
@@ -5751,12 +24250,49 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi862:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi863:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi864:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load i64, i64* %__b
@@ -5769,13 +24305,68 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi865:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi866:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi867:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load i64, i64* %__b
@@ -5792,12 +24383,20 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX %AX %EAX
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX %AX %EAX
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5808,12 +24407,20 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX %AX %EAX
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX %AX %EAX
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %load = load <8 x i64>, <8 x i64>* %__b
@@ -5825,13 +24432,22 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX %AX %EAX
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX %AX %EAX
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5844,13 +24460,22 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX %AX %EAX
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX %AX %EAX
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %load = load <8 x i64>, <8 x i64>* %__b
@@ -5865,12 +24490,20 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX %AX %EAX
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX %AX %EAX
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %load = load i64, i64* %__b
@@ -5883,13 +24516,22 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX %AX %EAX
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX %AX %EAX
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %load = load i64, i64* %__b
@@ -5905,12 +24547,70 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi868:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi869:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi870:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5921,12 +24621,70 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi871:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi872:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi873:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %load = load <8 x i64>, <8 x i64>* %__b
@@ -5938,13 +24696,72 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi874:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi875:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi876:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5957,13 +24774,72 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi877:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi878:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi879:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %load = load <8 x i64>, <8 x i64>* %__b
@@ -5978,12 +24854,70 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi880:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi881:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi882:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %load = load i64, i64* %__b
@@ -5996,13 +24930,72 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi883:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi884:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi885:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %load = load i64, i64* %__b
@@ -6018,12 +25011,75 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi886:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi887:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi888:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -6034,12 +25090,75 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi889:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi890:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi891:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %load = load <8 x i64>, <8 x i64>* %__b
@@ -6051,13 +25170,77 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi892:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi893:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi894:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -6070,13 +25253,77 @@ entry:
 }
 
 define zeroext i64
@test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi895: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi896: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi897: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -6091,12 +25338,75 @@ entry: define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; 
NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi898: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi899: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi900: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -6109,13 +25419,77 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi901: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi902: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi903: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: 
vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -6131,11 +25505,124 @@ entry: define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi904: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi905: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi906: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi907: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi908: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi909: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi910: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi911: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: 
kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -6146,11 +25633,125 @@ entry: } define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltb (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltb (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi912: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: 
.Lcfi913: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi914: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi915: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi916: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi917: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi918: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi919: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; 
NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -6162,12 +25763,126 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi920: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi921: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi922: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi923: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi924: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi925: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi926: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi927: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -6180,12 +25895,127 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi928: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi929: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi930: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi931: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi932: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi933: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi934: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi935: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, 
%r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -6200,11 +26030,129 @@ entry: define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi936: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi937: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi938: +; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi939: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi940: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi941: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi942: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi943: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; 
NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -6215,11 +26163,130 @@ entry: } define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltb (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltb (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi944: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi945: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi946: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi947: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi948: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi949: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi950: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi951: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw 
$3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -6231,12 +26298,131 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi952: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi953: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi954: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi955: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi956: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi957: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi958: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi959: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, 
{{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -6249,12 +26435,132 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 
+; VLX-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi960: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi961: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi962: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi963: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi964: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi965: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi966: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi967: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, 
%edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -6269,12 +26575,48 @@ entry: define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleb %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi968: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi969: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi970: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %1 = bitcast <4 x i64> %__b to <32 x i8> @@ -6285,12 +26627,49 @@ entry: } define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltb (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltb (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi971: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi972: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; 
NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi973:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <4 x i64> %__a to <32 x i8>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -6302,13 +26681,58 @@ entry:
 }
 define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi974:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi975:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi976:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <4 x i64> %__a to <32 x i8>
   %1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -6321,13 +26745,59 @@ entry:
 }
 define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpnltb (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpnltb (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi977:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi978:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi979:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm4
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
+; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4
+; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %ecx
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: shlq $32, %rax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <4 x i64> %__a to <32 x i8>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -6342,11 +26812,26 @@ entry:
 define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgew_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX %AX %EAX
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX %AX %EAX
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -6357,11 +26842,27 @@ entry:
} define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -6373,12 +26874,28 @@ entry: } define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -6391,12 +26908,29 @@ entry: } define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x 
i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -6411,11 +26945,74 @@ entry: define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi980: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi981: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi982: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -6426,11 +27023,75 @@ entry: } define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi983: +; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi984: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi985: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -6442,12 +27103,76 @@ entry: } define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi986: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi987: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi988: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord 
%zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -6460,12 +27185,77 @@ entry: } define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi989: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi990: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi991: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -6480,11 +27270,79 @@ entry: define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi992: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi993: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi994: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -6495,11 +27353,80 @@ entry: } define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi995: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi996: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi997: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq 
$32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -6511,12 +27438,81 @@ entry: } define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi998: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi999: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1000: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -6529,12 +27525,82 @@ entry: } define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, 
<2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1001: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1002: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1003: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -6549,12 +27615,125 @@ entry: define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmplew %ymm0, 
%ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1004: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1005: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1006: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1007: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1008: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1009: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1010: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1011: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb 
$13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -6565,12 +27744,126 @@ entry: } define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1012: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1013: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1014: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1015: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1016: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1017: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1018: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1019: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -6582,13 +27875,127 @@ entry: } define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1020: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1021: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1022: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1023: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1024: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1025: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1026: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1027: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, 
%zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -6601,13 +28008,128 @@ entry: } define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltw (%rsi), 
%ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1028: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1029: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1030: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1031: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1032: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1033: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1034: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1035: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -6622,12 +28144,130 @@ entry: define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1036: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1037: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1038: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1039: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1040: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1041: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1042: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1043: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -6638,12 +28278,131 @@ entry: } define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1044: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1045: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1046: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1047: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1048: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1049: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1050: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1051: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: 
vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -6655,13 +28414,132 @@ entry: } define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## 
%entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1052: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1053: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1054: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1055: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1056: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1057: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1058: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1059: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -6674,13 +28552,133 @@ entry: } define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1060: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1061: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1062: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1063: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1064: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1065: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1066: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1067: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, 
%k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -6695,12 +28693,351 @@ entry: define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmplew %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1068: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1069: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1070: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; 
NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; 
NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm2 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; 
NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -6711,12 +29048,268 @@ entry: } define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltw (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltw (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1071: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1072: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1073: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: 
movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, 
%k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; 
NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -6728,13 +29321,361 @@ entry: } define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmplew %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1074: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1075: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1076: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, 
%rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm6, %xmm6 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm8, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 +; NoVLX-NEXT: vpxor %ymm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3 +; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; 
NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -6747,13 +29688,278 @@ entry: } define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltw (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltw (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1077: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1078: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1079: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4 
+; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm2 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm5 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm5, %ymm3 +; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm5 +; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4 +; NoVLX-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 +; NoVLX-NEXT: vpxor %ymm5, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: 
vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4 +; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4 +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; 
NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -6768,11 +29974,53 @@ entry: define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -6783,11 +30031,54 @@ entry: } define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: 
test_vpcmpsged_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -6799,12 +30090,70 @@ entry: } define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, 
%zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -6818,12 +30167,71 @@ entry: } define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: 
vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -6839,12 +30247,55 @@ entry: define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -6857,13 +30308,72 @@ entry: } define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> 
%__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -6880,11 +30390,52 @@ entry: define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: 
test_vpcmpsged_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -6895,11 +30446,53 @@ entry: } define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; 
NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -6911,12 +30504,69 @@ entry: } define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; 
NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -6930,12 +30580,70 @@ entry: } define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -6951,12 +30659,54 @@ entry: define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: 
kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -6969,13 +30719,71 @@ entry: } define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb 
$8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -6992,11 +30800,41 @@ entry: define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1080: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1081: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1082: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -7007,11 +30845,42 @@ entry: } define zeroext i32 
@test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1083: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1084: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1085: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -7023,12 +30892,58 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1086: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1087: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1088: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -7042,12 +30957,59 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1089: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1090: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1091: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -7063,12 +31025,43 @@ entry: define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; 
CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1092: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1093: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1094: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -7081,13 +31074,60 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1095: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1096: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1097: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -7104,11 +31144,48 @@ entry: define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1098: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1099: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1100: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -7119,11 +31196,49 @@ entry: } define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1101: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1102: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1103: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, 
%rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -7135,12 +31250,65 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1104: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1105: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1106: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, 
%zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -7154,12 +31322,66 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1107: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1108: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1109: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -7175,12 +31397,50 @@ entry: define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: 
test_vpcmpsged_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1110: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1111: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1112: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -7193,13 +31453,67 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1113: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1114: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1115: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, 
%xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -7216,21 +31530,23 @@ entry: define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -7242,21 +31558,23 @@ entry: } define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -7269,23 +31587,25 @@ entry: } define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: 
vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -7299,23 +31619,25 @@ entry: } define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -7331,22 +31653,24 @@ entry: define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -7360,24 +31684,26 @@ entry: } define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} -; 
CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -7394,12 +31720,72 @@ entry: define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1116: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1117: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1118: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: 
popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -7410,12 +31796,72 @@ entry: } define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1119: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1120: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1121: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -7427,13 +31873,75 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: 
test_masked_vpcmpsged_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1122: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1123: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1124: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -7446,13 +31954,75 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1125: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1126: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1127: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; 
NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -7467,13 +32037,73 @@ entry: define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1128: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1129: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1130: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw 
$10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -7486,14 +32116,76 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1131: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1132: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1133: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; 
NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -7509,12 +32201,77 @@ entry: define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1134: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1135: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1136: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -7525,12 +32282,77 @@ entry: } define zeroext i64 
@test_vpcmpsged_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1137: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1138: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1139: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -7542,13 +32364,80 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; 
+; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1140: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1141: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1142: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -7561,13 +32450,80 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1143: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1144: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1145: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, 
%rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -7582,13 +32538,78 @@ entry: define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1146: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1147: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1148: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -7601,14 +32622,81 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1149: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1150: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1151: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -7624,12 +32712,120 @@ entry: define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1152: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1153: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1154: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1155: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1156: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1157: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1158: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1159: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -7640,12 +32836,120 @@ entry: } define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1160: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1161: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1162: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1163: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1164: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1165: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1166: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1167: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: 
vpcmpnltd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -7657,13 +32961,122 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, 
%k1 +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1168: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1169: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1170: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1171: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1172: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1173: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1174: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1175: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb 
$15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -7676,13 +33089,122 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1176: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1177: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1178: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1179: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1180: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1181: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1182: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1183: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -7697,13 +33219,122 @@ entry: define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm1 -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %zmm1 +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1184: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1185: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1186: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1187: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1188: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1189: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1190: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1191: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; 
NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -7716,14 +33347,124 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %zmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %zmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1192: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1193: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp 
+; NoVLX-NEXT: .Lcfi1194: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1195: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1196: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1197: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1198: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1199: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; 
NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -7739,12 +33480,125 @@ entry: define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1200: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1201: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1202: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1203: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1204: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1205: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1206: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1207: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -7755,12 +33609,125 @@ entry: } define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1208: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1209: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1210: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1211: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1212: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1213: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1214: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1215: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -7772,13 +33739,127 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1216: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1217: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1218: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1219: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1220: +; NoVLX-NEXT: .cfi_offset %r12, -48 
+; NoVLX-NEXT: .Lcfi1221: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1222: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1223: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -7791,13 +33872,127 @@ entry: } define zeroext i64 
@test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1224: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1225: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1226: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1227: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1228: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1229: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1230: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1231: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -7812,13 +34007,127 @@ entry: define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm1 -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %zmm1 +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1232: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1233: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1234: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1235: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1236: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1237: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1238: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1239: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -7831,14 +34140,129 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %zmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %zmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1240: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1241: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1242: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: 
pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1243: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1244: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1245: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1246: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1247: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 
+; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -7854,12 +34278,25 @@ entry: define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -7870,12 +34307,26 @@ entry: } define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -7887,13 +34338,34 @@ entry: } define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; 
NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -7907,13 +34379,35 @@ entry: } define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -7929,13 +34423,27 @@ entry: define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -7948,14 +34456,36 @@ entry: } define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* 
%__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -7972,11 +34502,37 @@ entry: define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -7987,11 +34543,38 @@ entry: } define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax 
+; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8003,12 +34586,46 @@ entry: } define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -8022,12 +34639,47 @@ entry: } define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; 
VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8043,12 +34695,39 @@ entry: define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -8061,13 +34740,48 @@ entry: } define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq 
%xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -8084,11 +34798,36 @@ entry: define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -8099,11 +34838,37 @@ entry: } define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; 
CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8115,12 +34880,45 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -8134,12 +34932,46 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} -; 
CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8155,12 +34987,38 @@ entry: define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -8173,13 +35031,47 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; 
CHECK-NEXT: vpbroadcastq (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -8196,11 +35088,41 @@ entry: define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1248: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1249: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1250: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -8211,11 
+35133,42 @@ entry: } define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1251: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1252: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1253: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8227,12 +35180,50 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1254: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1255: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1256: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: 
kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -8246,12 +35237,51 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1257: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1258: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1259: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8267,12 +35297,43 @@ entry: define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1260: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1261: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1262: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: 
vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -8285,13 +35346,52 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1263: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1264: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1265: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -8308,11 +35408,48 @@ entry: define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask: +; VLX: # BB#0: # %entry +; 
VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1266: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1267: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1268: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -8323,11 +35460,49 @@ entry: } define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1269: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1270: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1271: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; 
NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8339,12 +35514,57 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1272: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1273: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1274: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -8358,12 +35578,58 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1275: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1276: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1277: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; 
NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8379,12 +35645,50 @@ entry: define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1278: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1279: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1280: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp 
+; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -8397,13 +35701,59 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1281: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1282: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1283: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -8420,12 +35770,55 @@ entry: define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} 
{z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8436,12 +35829,56 @@ entry: } define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} 
{z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8453,13 +35890,74 @@ entry: } define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8473,13 
+35971,75 @@ entry: } define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8495,13 +36055,57 @@ entry: define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovb %k0, %eax 
-; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8514,14 +36118,76 @@ entry: } define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw 
$15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8538,12 +36204,54 @@ entry: define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, 
%zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8554,12 +36262,55 @@ entry: } define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8571,13 +36322,73 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8591,13 +36402,74 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: 
%AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8613,13 +36485,56 @@ entry: define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: 
vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8632,14 +36547,75 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, 
%eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8656,12 +36632,43 @@ entry: define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1284: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1285: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1286: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8672,12 +36679,44 @@ entry: } define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: 
vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1287: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1288: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1289: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8689,13 +36728,62 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1290: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1291: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1292: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8709,13 +36797,63 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1293: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1294: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1295: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8731,13 +36869,45 @@ entry: define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: 
test_vpcmpsgeq_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1296: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1297: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1298: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8750,14 +36920,64 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1299: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1300: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1301: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd 
%ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8774,12 +36994,50 @@ entry: define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1302: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1303: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1304: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8790,12 +37048,51 @@ entry: } define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 +; 
VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1305: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1306: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1307: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8807,13 +37104,69 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1308: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1309: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1310: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8827,13 +37180,70 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1311: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1312: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1313: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8849,13 +37259,52 @@ entry: define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1314: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1315: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1316: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8868,14 +37317,71 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1317: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: 
.Lcfi1318: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1319: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8892,12 +37398,20 @@ entry: define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -8908,12 +37422,20 @@ entry: } define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: 
test_vpcmpsgeq_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -8925,13 +37447,22 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -8944,13 +37475,22 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -8965,13 +37505,22 @@ entry: define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %zmm1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -8984,14 +37533,24 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: 
vpbroadcastq (%rsi), %zmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %zmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -9007,12 +37566,70 @@ entry: define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1320: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1321: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1322: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ 
-9023,12 +37640,70 @@ entry: } define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1323: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1324: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1325: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -9040,13 +37715,72 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1326: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1327: +; NoVLX-NEXT: 
.cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1328: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -9059,13 +37793,72 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1329: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1330: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1331: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -9080,13 +37873,72 @@ entry: define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %zmm1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1332: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1333: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1334: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -9099,14 +37951,74 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %zmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1335: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1336: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1337: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, 
i64* %__b @@ -9122,12 +38034,75 @@ entry: define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1338: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1339: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1340: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -9138,12 +38113,75 @@ entry: } define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1341: 
+; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1342: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1343: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -9155,13 +38193,77 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1344: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1345: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1346: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; 
NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -9174,13 +38276,77 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1347: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1348: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1349: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -9195,13 +38361,77 @@ entry: define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %zmm1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1350: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1351: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1352: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: 
kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -9214,14 +38444,79 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %zmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1353: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1354: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1355: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; 
NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -9237,11 +38532,125 @@ entry: define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1356: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1357: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1358: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1359: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1360: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1361: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1362: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1363: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -9252,11 +38661,125 @@ entry: } define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltub (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltub (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1364: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1365: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1366: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1367: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1368: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1369: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1370: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1371: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: 
kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -9268,12 +38791,127 @@ entry: } define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1372: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1373: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1374: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; 
NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1375: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1376: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1377: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1378: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1379: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq 
-40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -9286,12 +38924,127 @@ entry: } define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1380: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1381: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1382: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1383: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1384: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1385: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1386: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1387: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, 
%k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -9306,11 +39059,130 @@ entry: define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1388: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1389: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1390: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1391: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1392: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1393: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1394: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1395: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; 
NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -9321,11 +39193,130 @@ entry: } define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltub (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltub (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1396: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; 
NoVLX-NEXT: .Lcfi1397: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1398: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1399: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1400: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1401: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1402: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1403: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, 
%xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -9337,12 +39328,132 @@ entry: } define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1404: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1405: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1406: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1407: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1408: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1409: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1410: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1411: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d 
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -9355,12 +39466,132 @@ entry: } define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1412: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1413: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1414: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1415: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1416: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1417: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1418: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1419: +; 
NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = 
bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -9375,12 +39606,49 @@ entry: define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultb_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltub %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1420: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1421: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1422: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %1 = bitcast <4 x i64> %__b to <32 x i8> @@ -9391,12 +39659,49 @@ entry: } define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltub (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltub (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1423: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1424: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1425: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; 
NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %load = load <4 x i64>, <4 x i64>* %__b @@ -9408,13 +39713,59 @@ entry: } define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1426: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1427: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1428: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 +; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %ymm5, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm5, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %1 = bitcast <4 x i64> %__b to <32 x i8> @@ -9427,13 +39778,59 @@ entry: } define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltub (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax 
-; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltub (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1429: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1430: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1431: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm4, %ymm4 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %load = load <4 x i64>, <4 x i64>* %__b @@ -9448,11 +39845,27 @@ entry: define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -9463,11 +39876,27 @@ entry: } define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: 
test_vpcmpultw_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -9479,12 +39908,29 @@ entry: } define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -9497,12 +39943,29 @@ entry: } define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # 
kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -9517,11 +39980,75 @@ entry: define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1432: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1433: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1434: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -9532,11 +40059,75 @@ entry: } define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; 
NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1435: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1436: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1437: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -9548,12 +40139,77 @@ entry: } define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1438: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1439: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1440: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor 
%xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -9566,12 +40222,77 @@ entry: } define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1441: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1442: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1443: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -9586,11 +40307,80 @@ entry: define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1444: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1445: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1446: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -9601,11 +40391,80 @@ entry: } define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1447: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1448: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1449: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -9617,12 +40476,82 @@ entry: } define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1450: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1451: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1452: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -9635,12 +40564,82 @@ entry: } define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1453: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1454: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1455: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax 
+; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -9655,12 +40654,126 @@ entry: define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1456: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1457: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1458: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1459: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1460: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1461: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1462: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1463: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, 
%xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -9671,12 +40784,126 @@ entry: } define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1464: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1465: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1466: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1467: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1468: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1469: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1470: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1471: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -9688,13 +40915,128 @@ entry: } define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1472: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1473: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1474: +; 
NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1475: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1476: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1477: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1478: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1479: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: 
vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -9707,13 +41049,128 @@ entry: } define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1480: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1481: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1482: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1483: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1484: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1485: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1486: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1487: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; 
NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -9728,12 +41185,131 @@ entry: define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1488: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1489: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1490: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1491: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1492: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1493: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1494: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1495: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -9744,12 +41320,131 @@ entry: } define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq 
+; VLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1496: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1497: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1498: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1499: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1500: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1501: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1502: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1503: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -9761,13 +41456,133 @@ entry: } define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1504: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1505: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1506: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1507: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1508: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1509: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1510: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1511: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -9780,13 +41595,133 @@ entry: } define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1512: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1513: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp 
+; NoVLX-NEXT: .Lcfi1514: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1515: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1516: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1517: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1518: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1519: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: 
kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -9801,12 +41736,353 @@ entry: define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1520: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1521: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1522: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw 
$2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, 
%eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm3, %ymm3 +; NoVLX-NEXT: vpxor %ymm2, %ymm4, %ymm4 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, 
%xmm3 +; NoVLX-NEXT: vpmovsxbd %xmm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -9817,12 +42093,268 @@ entry: } define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* 
%__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1523: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1524: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1525: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; 
NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpxor 32(%rdi), %ymm1, %ymm3 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -9834,13 +42366,363 @@ entry: } define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: 
test_masked_vpcmpultw_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1526: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1527: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1528: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; 
NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm8 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm1, %xmm0 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; NoVLX-NEXT: vpmovdb %zmm1, %xmm7 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: 
shrq $32, %rcx +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm6 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm6, %ymm4, %ymm3 +; NoVLX-NEXT: vpxor %ymm6, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm4 +; NoVLX-NEXT: vpxor %ymm6, %ymm8, %ymm2 +; NoVLX-NEXT: vpxor %ymm6, %ymm5, %ymm3 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpand %xmm7, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm0, %xmm4, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -9853,13 +42735,278 @@ entry: } define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; 
NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1529: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1530: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1531: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm4 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm5 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm7 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, 
%zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm2 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm7, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm5, %ymm3, %ymm3 +; NoVLX-NEXT: vpxor (%rsi), %ymm5, %ymm6 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm6, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4 +; NoVLX-NEXT: vpxor 32(%rsi), %ymm5, %ymm5 +; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4 +; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4 +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -9874,11 +43021,54 @@ entry: define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; 
VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -9889,11 +43079,54 @@ entry: } define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, 
%zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -9905,12 +43138,73 @@ entry: } define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq 
$255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -9924,12 +43218,73 @@ entry: } define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x 
i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -9945,11 +43300,55 @@ entry: define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -9962,12 +43361,74 @@ entry: } define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -9984,11 +43445,53 @@ entry: define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; 
NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -9999,11 +43502,53 @@ entry: } define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -10015,12 +43560,72 @@ entry: } define zeroext i16 
@test_masked_vpcmpultd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -10034,12 +43639,72 @@ entry: } define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; 
VLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -10055,11 +43720,54 @@ entry: define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = 
[2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -10072,12 +43780,73 @@ entry: } define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, 
%zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -10094,11 +43863,42 @@ entry: define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1532: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1533: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1534: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -10109,11 +43909,42 @@ entry: } define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry 
-; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1535: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1536: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1537: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -10125,12 +43956,61 @@ entry: } define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1538: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1539: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1540: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -10144,12 +44024,61 @@ entry: } define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1541: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1542: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1543: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -10165,11 +44094,43 @@ entry: define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b: -; CHECK: 
## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1544: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1545: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1546: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -10182,12 +44143,62 @@ entry: } define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1547: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1548: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1549: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: 
vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -10204,11 +44215,49 @@ entry: define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1550: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1551: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1552: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -10219,11 +44268,49 @@ entry: } define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1553: +; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1554: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1555: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -10235,12 +44322,68 @@ entry: } define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1556: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1557: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1558: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} 
ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -10254,12 +44397,68 @@ entry: } define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1559: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1560: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1561: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: 
orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -10275,11 +44474,50 @@ entry: define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1562: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1563: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1564: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -10292,12 +44530,69 @@ entry: } define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1565: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1566: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1567: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, 
%xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -10314,21 +44609,23 @@ entry: define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -10340,21 +44637,23 @@ entry: } define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: 
%YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -10367,23 +44666,25 @@ entry: } define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -10397,23 +44698,25 @@ entry: } define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -10429,21 +44732,23 @@ entry: define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: 
vpcmpltud %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -10457,23 +44762,25 @@ entry: } define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -10490,12 +44797,72 @@ entry: define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1568: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1569: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1570: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -10506,12 +44873,72 @@ entry: } define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1571: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1572: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1573: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -10523,13 +44950,75 @@ entry: } define zeroext i32 
@test_masked_vpcmpultd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1574: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1575: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1576: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -10542,13 +45031,75 @@ entry: } define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; 
NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1577: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1578: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1579: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -10563,12 +45114,72 @@ entry: define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1580: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1581: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1582: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -10581,13 +45192,75 @@ entry: } define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1583: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1584: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1585: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 
+; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -10603,12 +45276,77 @@ entry: define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1586: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1587: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1588: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: 
vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -10619,12 +45357,77 @@ entry: } define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1589: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1590: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1591: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -10636,13 +45439,80 @@ entry: } define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; 
CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1592: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1593: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1594: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -10655,13 +45525,80 @@ entry: } define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1} 
+; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1595: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1596: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1597: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -10676,12 +45613,77 @@ entry: define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1598: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1599: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1600: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: 
%YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -10694,13 +45696,80 @@ entry: } define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1601: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1602: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1603: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -10716,12 +45785,120 @@ entry: define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1604: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1605: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1606: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1607: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1608: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1609: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1610: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1611: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -10732,12 +45909,120 @@ entry: } define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1612: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1613: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1614: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: 
subq $32, %rsp +; NoVLX-NEXT: .Lcfi1615: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1616: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1617: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1618: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1619: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -10749,13 +46034,122 @@ entry: } define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr 
{ -; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1620: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1621: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1622: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1623: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1624: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1625: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1626: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1627: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -10768,13 +46162,122 @@ entry: } define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1628: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1629: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1630: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1631: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1632: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1633: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1634: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1635: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw 
$6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -10789,12 +46292,120 @@ entry: define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1636: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1637: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1638: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1639: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1640: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1641: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1642: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1643: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <16 x i32>
 %load = load i32, i32* %__b
@@ -10807,13 +46418,122 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1644: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1645: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1646: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1647: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1648: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1649: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1650: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1651: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl 
(%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -10829,12 +46549,125 @@ entry: define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1652: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1653: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1654: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1655: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1656: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1657: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1658: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1659: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, 
%xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -10845,12 +46678,125 @@ entry: } define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1660: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1661: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1662: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1663: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1664: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1665: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1666: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1667: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw 
$10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -10862,13 +46808,127 @@ entry: } define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1668: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1669: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1670: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq 
$-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: .Lcfi1671:
+; NoVLX-NEXT: .cfi_offset %rbx, -56
+; NoVLX-NEXT: .Lcfi1672:
+; NoVLX-NEXT: .cfi_offset %r12, -48
+; NoVLX-NEXT: .Lcfi1673:
+; NoVLX-NEXT: .cfi_offset %r13, -40
+; NoVLX-NEXT: .Lcfi1674:
+; NoVLX-NEXT: .cfi_offset %r14, -32
+; NoVLX-NEXT: .Lcfi1675:
+; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: leaq -40(%rbp), %rsp
+; NoVLX-NEXT: popq %rbx
+; NoVLX-NEXT: popq %r12
+; NoVLX-NEXT: popq %r13
+; NoVLX-NEXT: popq %r14
+; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -10881,13 +46941,127 @@ entry: } define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1676: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1677: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1678: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1679: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1680: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1681: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1682: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1683: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw 
%k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -10902,12 +47076,125 @@ entry: define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1684: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1685: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1686: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1687: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1688: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1689: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1690: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1691: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -10920,13 +47207,127 @@ entry: } define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1692: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1693: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1694: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: 
andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1695: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1696: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1697: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1698: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1699: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; 
NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -10942,12 +47343,26 @@ entry: define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -10958,12 +47373,26 @@ entry: } define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -10975,13 +47404,37 @@ entry: } define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, 
%xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -10995,13 +47448,37 @@ entry: } define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -11017,12 +47494,27 @@ entry: define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd 
%zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <2 x i64> %__a to <2 x i64>
 %load = load i64, i64* %__b
@@ -11035,13 +47527,38 @@ entry:
 }
 
 define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <2 x i64> %__a to <2 x i64>
 %load = load i64, i64* %__b
@@ -11058,11 +47575,38 @@ entry:
 define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL %AL %EAX
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL %AL %EAX
+; NoVLX-NEXT: retq
entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -11073,11 +47617,38 @@ entry: } define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -11089,12 +47660,49 @@ entry: } define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: 
vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -11108,12 +47716,49 @@ entry: } define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -11129,11 +47774,39 @@ entry: define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; 
NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -11146,12 +47819,50 @@ entry: } define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -11168,11 +47879,37 @@ entry: define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; 
NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -11183,11 +47920,37 @@ entry: } define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -11199,12 +47962,48 @@ entry: } define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx 
+; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -11218,12 +48017,48 @@ entry: } define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -11239,11 +48074,38 @@ entry: define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX 
%EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -11256,12 +48118,49 @@ entry: } define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -11278,11 +48177,42 @@ entry: define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq 
%xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1700: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1701: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1702: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -11293,11 +48223,42 @@ entry: } define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1703: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1704: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1705: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -11309,12 +48270,53 @@ entry: } define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, 
<2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1706: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1707: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1708: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -11328,12 +48330,53 @@ entry: } define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1709: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1710: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1711: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw 
$15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -11349,11 +48392,43 @@ entry: define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1712: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1713: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1714: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -11366,12 +48441,54 @@ entry: } define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: 
pushq %rbp +; NoVLX-NEXT: .Lcfi1715: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1716: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1717: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -11388,11 +48505,49 @@ entry: define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1718: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1719: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1720: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), 
%eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -11403,11 +48558,49 @@ entry: } define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1721: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1722: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1723: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -11419,12 +48612,60 @@ entry: } define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1724: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1725: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1726: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: 
kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -11438,12 +48679,60 @@ entry: } define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1727: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1728: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1729: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: 
vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -11459,11 +48748,50 @@ entry: define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1730: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1731: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1732: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -11476,12 +48804,61 @@ entry: } define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1733: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1734: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1735: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: 
subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -11498,12 +48875,56 @@ entry: define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, 
%zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -11514,12 +48935,56 @@ entry: } define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11531,13 +48996,75 @@ entry: } define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; 
VLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -11551,13 +49078,75 @@ entry: } define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: 
vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11573,12 +49162,57 @@ entry: define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; 
NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11591,13 +49225,76 @@ entry: } define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11614,12 +49311,55 @@ entry: define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; 
NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -11630,12 +49370,55 @@ entry: } define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11647,13 +49430,74 @@ entry: } define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask: +; VLX: # BB#0: # 
%entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -11667,13 +49511,74 @@ entry: } define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: 
test_masked_vpcmpultq_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11689,12 +49594,56 @@ entry: define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, 
%ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11707,13 +49656,75 @@ entry: } define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, 
%k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11730,12 +49741,44 @@ entry: define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1736: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1737: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1738: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -11746,12 +49789,44 @@ entry: } define zeroext i32 
@test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1739: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1740: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1741: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11763,13 +49838,63 @@ entry: } define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1742: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1743: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1744: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw 
$15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -11783,13 +49908,63 @@ entry: } define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1745: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1746: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1747: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11805,12 +49980,45 @@ entry: define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1748: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1749: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1750: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11823,13 +50031,64 @@ entry: } define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1751: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1752: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1753: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; 
NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11846,12 +50105,51 @@ entry: define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1754: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1755: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1756: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq 
$32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -11862,12 +50160,51 @@ entry: } define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1757: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1758: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1759: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11879,13 +50216,70 @@ entry: } define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1760: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1761: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1762: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = 
[9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -11899,13 +50293,70 @@ entry: } define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1763: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1764: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1765: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: 
kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11921,12 +50372,52 @@ entry: define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1766: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1767: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1768: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax 
+; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11939,13 +50430,71 @@ entry: } define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1769: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1770: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1771: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11962,12 +50511,20 @@ entry: define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; 
CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -11978,12 +50535,20 @@ entry: } define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -11995,13 +50560,22 @@ entry: } define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -12014,13 +50588,22 @@ entry: } define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -12035,12 +50618,20 @@ entry: define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) 
local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -12053,13 +50644,22 @@ entry: } define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -12075,12 +50675,70 @@ entry: define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1772: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1773: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1774: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -12091,12 +50749,70 @@ entry: } define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1775: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1776: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1777: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: 
retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -12108,13 +50824,72 @@ entry: } define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1778: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1779: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1780: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -12127,13 +50902,72 @@ entry: } define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; 
VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1781: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1782: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1783: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -12148,12 +50982,70 @@ entry: define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1784: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1785: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1786: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -12166,13 +51058,72 @@ entry: } define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1787: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1788: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1789: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -12188,12 +51139,75 @@ entry: define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1790: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1791: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1792: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = 
bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -12204,12 +51218,75 @@ entry: } define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1793: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1794: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1795: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -12221,13 +51298,77 @@ entry: } define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: 
kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1796: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1797: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1798: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -12240,13 +51381,77 @@ entry: } define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1799: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1800: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1801: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; 
NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -12261,12 +51466,75 @@ entry: define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1802: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1803: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1804: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -12279,13 +51547,77 @@ entry: } define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1805: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1806: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1807: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, 
%r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -12302,11 +51634,51 @@ entry: declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32) define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %1 = bitcast <2 x i64> %__b to <4 x float> @@ -12317,11 +51689,51 @@ entry: } define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; 
VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load <2 x i64>, <2 x i64>* %__b @@ -12333,11 +51745,52 @@ entry: } define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1 +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; 
NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load float, float* %__b @@ -12351,11 +51804,50 @@ entry: define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %1 = bitcast <2 x i64> %__b to <4 x float> @@ -12366,11 +51858,50 @@ entry: } define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; 
NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load <2 x i64>, <2 x i64>* %__b @@ -12382,11 +51913,51 @@ entry: } define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1 +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = 
[0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load float, float* %__b @@ -12400,11 +51971,39 @@ entry: define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1808: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1809: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1810: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %1 = bitcast <2 x i64> %__b to <4 x float> @@ -12415,11 +52014,39 @@ entry: } define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1811: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1812: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1813: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq 
%rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load <2 x i64>, <2 x i64>* %__b @@ -12431,11 +52058,40 @@ entry: } define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1814: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1815: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1816: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1 +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load float, float* %__b @@ -12449,11 +52105,46 @@ entry: define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1817: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1818: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1819: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: 
movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %1 = bitcast <2 x i64> %__b to <4 x float> @@ -12464,11 +52155,46 @@ entry: } define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1820: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1821: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1822: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load <2 x i64>, <2 x i64>* %__b @@ -12480,11 +52206,47 @@ entry: } define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1823: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1824: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1825: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1 +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load float, float* %__b @@ -12498,21 +52260,23 @@ entry: define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -12524,21 +52288,23 @@ entry: } define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovaps (%rdi), %ymm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -12551,21 +52317,23 @@ entry: } define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1 ; NoVLX-NEXT: 
vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -12580,12 +52348,72 @@ entry: define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1826: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1827: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1828: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> %1 = bitcast <4 x i64> %__b to <8 x float> @@ -12596,12 +52424,72 @@ entry: } define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; 
NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1829: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1830: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1831: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovaps (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> %load = load <4 x i64>, <4 x i64>* %__b @@ -12613,12 +52501,72 @@ entry: } define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1832: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1833: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1834: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d 
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> %load = load float, float* %__b @@ -12632,12 +52580,77 @@ entry: define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1835: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1836: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1837: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> %1 = bitcast <4 x i64> %__b to <8 x float> @@ -12648,12 +52661,77 @@ entry: } define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1838: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1839: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1840: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vmovaps (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; 
NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> %load = load <4 x i64>, <4 x i64>* %__b @@ -12665,12 +52743,77 @@ entry: } define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1841: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1842: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1843: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> %load = load float, float* %__b @@ -12684,12 +52827,120 @@ entry: define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: 
vcmpeqps %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1844: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1845: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1846: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1847: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1848: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1849: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1850: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1851: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb 
$13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %1 = bitcast <8 x i64> %__b to <16 x float> @@ -12700,12 +52951,120 @@ entry: } define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1852: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1853: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1854: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1855: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1856: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1857: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1858: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1859: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %load = load <8 x i64>, <8 x i64>* %__b @@ -12717,12 +53076,120 @@ entry: } define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1860: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1861: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1862: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1863: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1864: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1865: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1866: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1867: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw 
$11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %load = load float, float* %__b @@ -12736,12 +53203,18 @@ entry: define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovw %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %1 = bitcast <8 x i64> %__b to <16 x float> @@ -12752,12 +53225,125 @@ entry: define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; 
CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1868: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1869: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1870: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1871: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1872: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1873: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1874: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1875: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %1 = bitcast <8 x i64> %__b to <16 x float> @@ -12768,12 +53354,125 @@ entry: } define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1876: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1877: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1878: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1879: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1880: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1881: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1882: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1883: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %load = load <8 x i64>, <8 x i64>* %__b @@ -12785,12 +53484,125 @@ entry: } define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1884: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1885: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1886: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1887: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1888: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1889: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1890: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1891: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %load = load float, float* %__b @@ -12804,13 +53616,20 @@ entry: define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: movzwl %ax, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; 
NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzwl %ax, %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %1 = bitcast <8 x i64> %__b to <16 x float> @@ -12822,12 +53641,23 @@ entry: declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32) define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %1 = bitcast <2 x i64> %__b to <2 x double> @@ -12838,12 +53668,23 @@ entry: } define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load <2 x i64>, <2 x i64>* %__b @@ -12855,12 +53696,24 @@ entry: } define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, 
-{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load double, double* %__b @@ -12874,11 +53727,35 @@ entry: define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %1 = bitcast <2 x i64> %__b to <2 x double> @@ -12889,11 +53766,35 @@ entry: } define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load <2 x i64>, <2 x i64>* %__b @@ -12905,11 +53806,36 @@ entry: } define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to2}, 
%xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load double, double* %__b @@ -12923,11 +53849,34 @@ entry: define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %1 = bitcast <2 x i64> %__b to <2 x double> @@ -12938,11 +53887,34 @@ entry: } define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: 
korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load <2 x i64>, <2 x i64>* %__b @@ -12954,11 +53926,35 @@ entry: } define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load double, double* %__b @@ -12972,11 +53968,39 @@ entry: define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1892: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1893: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1894: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: 
retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %1 = bitcast <2 x i64> %__b to <2 x double> @@ -12987,11 +54011,39 @@ entry: } define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1895: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1896: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1897: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load <2 x i64>, <2 x i64>* %__b @@ -13003,11 +54055,40 @@ entry: } define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1898: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1899: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1900: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load double, double* %__b @@ -13021,11 +54102,46 @@ 
entry: define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1901: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1902: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1903: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %1 = bitcast <2 x i64> %__b to <2 x double> @@ -13036,11 +54152,46 @@ entry: } define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1904: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1905: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1906: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw 
%k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load <2 x i64>, <2 x i64>* %__b @@ -13052,11 +54203,47 @@ entry: } define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1907: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1908: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1909: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load double, double* %__b @@ -13070,12 +54257,53 @@ entry: define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %1 = bitcast <4 x i64> %__b to <4 x double> @@ -13086,12 +54314,53 @@ entry: } define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load <4 x i64>, <4 x i64>* %__b @@ -13103,12 +54372,54 @@ entry: } define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr { 
-; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load double, double* %__b @@ -13122,12 +54433,52 @@ entry: define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; 
NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %1 = bitcast <4 x i64> %__b to <4 x double> @@ -13138,12 +54489,52 @@ entry: } define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load <4 x i64>, <4 x i64>* %__b @@ -13155,12 +54546,53 @@ entry: } define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: 
test_vcmpoeqpd_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load double, double* %__b @@ -13174,12 +54606,41 @@ entry: define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1910: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1911: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1912: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; 
NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %1 = bitcast <4 x i64> %__b to <4 x double> @@ -13190,12 +54651,41 @@ entry: } define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1913: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1914: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1915: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load <4 x i64>, <4 x i64>* %__b @@ -13207,12 +54697,42 @@ entry: } define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1916: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1917: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1918: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; 
NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load double, double* %__b @@ -13226,12 +54746,48 @@ entry: define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1919: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1920: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1921: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %1 = bitcast <4 x i64> %__b to <4 x double> @@ -13242,12 +54798,48 @@ entry: } define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1922: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1923: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1924: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: 
vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load <4 x i64>, <4 x i64>* %__b @@ -13259,12 +54851,49 @@ entry: } define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1925: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1926: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1927: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load double, double* %__b @@ -13278,12 +54907,20 @@ entry: define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: 
%AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> @@ -13294,12 +54931,20 @@ entry: } define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %load = load <8 x i64>, <8 x i64>* %__b @@ -13311,12 +54956,20 @@ entry: } define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %load = load double, double* %__b @@ -13330,12 +54983,22 @@ entry: define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: movzbl %al, %eax +; VLX-NEXT: # kill: %AX %AX %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: # kill: %AX %AX %EAX +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> @@ -13346,12 +55009,70 @@ entry: define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: 
vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1928: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1929: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1930: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> @@ -13362,12 +55083,70 @@ entry: } define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1931: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1932: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1933: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, 
%k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %load = load <8 x i64>, <8 x i64>* %__b @@ -13379,12 +55158,70 @@ entry: } define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1934: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1935: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1936: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %load = load double, double* %__b @@ -13398,12 +55235,19 @@ entry: define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> @@ -13414,12 +55258,75 @@ entry: define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1937: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1938: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1939: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> @@ -13430,12 +55337,75 @@ entry: } define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1940: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1941: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1942: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = 
bitcast <8 x i64> %__a to <8 x double> %load = load <8 x i64>, <8 x i64>* %__b @@ -13447,12 +55417,75 @@ entry: } define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1943: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1944: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1945: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %load = load double, double* %__b @@ -13466,13 +55499,20 @@ entry: define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: movzbl %al, 
%eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> -- 2.7.4
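
Note on the NoVLX sequences in the checks above: the narrow-mask cases all share one shape. Without AVX512VL a 2-, 4- or 8-bit compare result cannot live at its narrow width in a k-register, so the generated code widens everything to 512 bits, rebuilds the zero-extended mask bit by bit (vpextrb/kmovw/kshiftlw/vpinsrb chains), and round-trips the result through a 32-byte-aligned stack slot (kmovw to (%rsp) followed by movl (%rsp), %eax), where the VLX path needs only a compare into %k0 plus a single kmov. The standalone reproducer below is an illustrative sketch written for this note, not taken from the test file: the function name is made up, and the shufflevector/bitcast widening is an assumed stand-in for the elided test bodies. Compiling it with llc -mtriple=x86_64-unknown-unknown -mcpu=knl should show the stack round-trip, while -mcpu=skx keeps the mask in a k-register.

define zeroext i32 @widen_v4i1_to_i32(<4 x double> %a, <4 x double> %b) {
entry:
  ; 4-lane ordered-equal compare -> <4 x i1> mask
  %cmp = fcmp oeq <4 x double> %a, %b
  ; Pad to 32 lanes: indices 4-7 address the zeroinitializer operand,
  ; so lanes 4-31 of the result are all false (a zero-extension).
  %pad = shufflevector <4 x i1> %cmp, <4 x i1> zeroinitializer,
    <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7,
                i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ; Reinterpret the 32 mask bits as a plain integer return value.
  %m = bitcast <32 x i1> %pad to i32
  ret i32 %m
}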