From 5181255696e182a3b545328574f7e56dc0e20077 Mon Sep 17 00:00:00 2001 From: Filipe Cabecinhas Date: Fri, 6 Jun 2014 18:07:06 +0000 Subject: [PATCH] Fixed a bug in lowering shuffle_vectors to insertps Summary: We were being too strict and not accounting for undefs. Added a test case and fixed another one where we improved codegen. Reviewers: grosbach, nadav, delena Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D4039 llvm-svn: 210361 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 29 ++++++++++++++++++++--------- llvm/test/CodeGen/X86/avx-shuffle.ll | 6 ++++-- llvm/test/CodeGen/X86/sse41.ll | 11 +++++++++++ 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 610be3f..9e728c7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3964,14 +3964,22 @@ static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) { unsigned CorrectPosV1 = 0; unsigned CorrectPosV2 = 0; - for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) + for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) { + if (Mask[i] == -1) { + ++CorrectPosV1; + ++CorrectPosV2; + continue; + } + if (Mask[i] == i) ++CorrectPosV1; else if (Mask[i] == i + 4) ++CorrectPosV2; + } if (CorrectPosV1 == 3 || CorrectPosV2 == 3) - // We have 3 elements from one vector, and one from another. + // We have 3 elements (undefs count as elements from any vector) from one + // vector, and one from another. 
return true; return false; @@ -7462,8 +7470,9 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, assert((VT == MVT::v4f32 || VT == MVT::v4i32) && "unsupported vector type for insertps/pinsrd"); - int FromV1 = std::count_if(Mask.begin(), Mask.end(), - [](const int &i) { return i < 4; }); + auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; }; + auto FromV2Predicate = [](const int &i) { return i >= 4; }; + int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate); SDValue From; SDValue To; @@ -7471,15 +7480,17 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, if (FromV1 == 1) { From = V1; To = V2; - DestIndex = std::find_if(Mask.begin(), Mask.end(), - [](const int &i) { return i < 4; }) - + DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) - Mask.begin(); } else { + assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 && + "More than one element from V1 and from V2, or no elements from one " + "of the vectors. This case should not have returned true from " + "isINSERTPSMask"); From = V2; To = V1; - DestIndex = std::find_if(Mask.begin(), Mask.end(), - [](const int &i) { return i >= 4; }) - - Mask.begin(); + DestIndex = + std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin(); } if (MayFoldLoad(From)) { diff --git a/llvm/test/CodeGen/X86/avx-shuffle.ll b/llvm/test/CodeGen/X86/avx-shuffle.ll index f407ba4..f3f7e55 100644 --- a/llvm/test/CodeGen/X86/avx-shuffle.ll +++ b/llvm/test/CodeGen/X86/avx-shuffle.ll @@ -5,8 +5,10 @@ define <4 x float> @test1(<4 x float> %a) nounwind { %b = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> ret <4 x float> %b ; CHECK-LABEL: test1: -; CHECK: vshufps -; CHECK: vpshufd +;; TODO: This test could be improved by removing the xor instruction and +;; having vinsertps zero out the needed elements. 
+; CHECK: vxorps +; CHECK: vinsertps } ; rdar://10538417 diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll index a3c6201..a77ede2 100644 --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -692,3 +692,14 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl %13 = fadd <4 x float> %11, %12 ret <4 x float> %13 } + +define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) { +; CHECK-LABEL: insertps_with_undefs: +; CHECK-NOT: shufps +; CHECK: insertps $32, %xmm0 +; CHECK: ret + %1 = load float* %b, align 4 + %2 = insertelement <4 x float> undef, float %1, i32 0 + %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> + ret <4 x float> %result +} -- 2.7.4