From a74b1faba242e0ca4608e5d90495766a272758f0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 20 Sep 2020 13:53:26 -0700 Subject: [PATCH] [X86] Make reduceMaskedLoadToScalarLoad/reduceMaskedStoreToScalarStore work for avx512 after type legalization. The scalar elements of the vXi1 build_vector will have been type legalized to i8 by padding with 0s. So we can't check for all ones. Instead we should just look at bit 0 of the constant. Differential Revision: https://reviews.llvm.org/D87863 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- llvm/test/CodeGen/X86/masked_load.ll | 24 ++++++++-------- llvm/test/CodeGen/X86/masked_store.ll | 51 +++++++++------------------------ 3 files changed, 26 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f0c66cc..1449490 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -44454,7 +44454,7 @@ static int getOneTrueElt(SDValue V) { auto *ConstNode = dyn_cast<ConstantSDNode>(Op); if (!ConstNode) return -1; - if (ConstNode->getAPIntValue().isAllOnesValue()) { + if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) { // If we already found a one, this is too many. 
if (TrueIndex >= 0) return -1; diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll index 44938d0..7d1e295 100644 --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -7235,43 +7235,43 @@ define <16 x i64> @load_one_mask_bit_set6(<16 x i64>* %addr, <16 x i64> %val) { ; ; AVX512F-LABEL: load_one_mask_bit_set6: ; AVX512F: ## %bb.0: -; AVX512F-NEXT: movb $4, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} ; AVX512F-NEXT: movb $36, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vpinsrq $0, 16(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vinserti32x4 $1, %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VLDQ-LABEL: load_one_mask_bit_set6: ; AVX512VLDQ: ## %bb.0: -; AVX512VLDQ-NEXT: movb $4, %al -; AVX512VLDQ-NEXT: kmovw %eax, %k1 -; AVX512VLDQ-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} ; AVX512VLDQ-NEXT: movb $36, %al ; AVX512VLDQ-NEXT: kmovw %eax, %k1 ; AVX512VLDQ-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VLDQ-NEXT: vpinsrq $0, 16(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti32x4 $1, %xmm2, %zmm0, %zmm0 ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: load_one_mask_bit_set6: ; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: movb $4, %al -; AVX512VLBW-NEXT: kmovd %eax, %k1 -; AVX512VLBW-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} ; AVX512VLBW-NEXT: movb $36, %al ; AVX512VLBW-NEXT: kmovd %eax, %k1 ; AVX512VLBW-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} +; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VLBW-NEXT: vpinsrq $0, 16(%rdi), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vinserti32x4 $1, %xmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq ; ; X86-AVX512-LABEL: load_one_mask_bit_set6: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: movb $4, %cl -; X86-AVX512-NEXT: kmovd %ecx, %k1 -; X86-AVX512-NEXT: 
vmovdqu64 (%eax), %zmm0 {%k1} ; X86-AVX512-NEXT: movb $36, %cl ; X86-AVX512-NEXT: kmovd %ecx, %k1 ; X86-AVX512-NEXT: vmovdqu64 64(%eax), %zmm1 {%k1} +; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX512-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; X86-AVX512-NEXT: vinsertf32x4 $1, %xmm2, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl %res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %addr, i32 4, <16 x i1> , <16 x i64> %val) ret <16 x i64> %res diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index 319d946..36a2793 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -4943,48 +4943,23 @@ define void @one_mask_bit_set6(<16 x i64>* %addr, <16 x i64> %val) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: one_mask_bit_set6: -; AVX512F: ## %bb.0: -; AVX512F-NEXT: movb $8, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 %zmm1, 64(%rdi) {%k1} -; AVX512F-NEXT: movb $64, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VLDQ-LABEL: one_mask_bit_set6: -; AVX512VLDQ: ## %bb.0: -; AVX512VLDQ-NEXT: movb $8, %al -; AVX512VLDQ-NEXT: kmovw %eax, %k1 -; AVX512VLDQ-NEXT: vmovdqu64 %zmm1, 64(%rdi) {%k1} -; AVX512VLDQ-NEXT: movb $64, %al -; AVX512VLDQ-NEXT: kmovw %eax, %k1 -; AVX512VLDQ-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1} -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq -; -; AVX512VLBW-LABEL: one_mask_bit_set6: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: movb $8, %al -; AVX512VLBW-NEXT: kmovd %eax, %k1 -; AVX512VLBW-NEXT: vmovdqu64 %zmm1, 64(%rdi) {%k1} -; AVX512VLBW-NEXT: movb $64, %al -; AVX512VLBW-NEXT: kmovd %eax, %k1 -; AVX512VLBW-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1} -; AVX512VLBW-NEXT: vzeroupper -; AVX512VLBW-NEXT: retq +; AVX512-LABEL: one_mask_bit_set6: +; AVX512: ## %bb.0: +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; 
AVX512-NEXT: vmovlps %xmm0, 48(%rdi) +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512-NEXT: vpextrq $1, %xmm0, 88(%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; X86-AVX512-LABEL: one_mask_bit_set6: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: movb $8, %cl -; X86-AVX512-NEXT: kmovd %ecx, %k1 -; X86-AVX512-NEXT: vmovdqu64 %zmm1, 64(%eax) {%k1} -; X86-AVX512-NEXT: movb $64, %cl -; X86-AVX512-NEXT: kmovd %ecx, %k1 -; X86-AVX512-NEXT: vmovdqu64 %zmm0, (%eax) {%k1} +; X86-AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; X86-AVX512-NEXT: vmovlps %xmm0, 48(%eax) +; X86-AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0 +; X86-AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X86-AVX512-NEXT: vmovlps %xmm0, 88(%eax) ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl call void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %val, <16 x i64>* %addr, i32 4, <16 x i1>) -- 2.7.4