From a207436b01b198c1fad9888d4b7953ccdb1ee830 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 8 Feb 2016 23:03:46 +0000 Subject: [PATCH] [X86][SSE1] Add MOVLHPS/MOVHLPS lowering and memory folding support As discussed on PR26491, this patch adds support for lowering v4f32 shuffles to the MOVLHPS/MOVHLPS instructions. It also adds support for memory folding with their MOVLPS/MOVHPS load equivalents. This first patch only really helps SSE1 targets as SSE2+ targets will widen the shuffle mask and use v2f64 equivalents (although they still combine to MOVLHPS/MOVHLPS for v2f64 splats). This will have to be addressed in a future patch, most likely when we add support for binary target shuffle combines. Differential Revision: http://reviews.llvm.org/D16956 llvm-svn: 260168 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 6 ++++++ llvm/lib/Target/X86/X86InstrInfo.cpp | 23 ++++++++++++++++++++++ llvm/test/CodeGen/X86/vector-shuffle-sse1.ll | 29 ++++++++++++++++++++++------ 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f853c79..51b02e8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -9022,6 +9022,12 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return BlendPerm; } + // Use low/high mov instructions. + if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) + return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7})) + return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1); + // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 092e443..bc73440 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -998,6 +998,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 }, { X86::MINSSrr, X86::MINSSrm, 0 }, { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 }, + { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE }, { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, @@ -1298,6 +1299,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 }, { X86::VMINSSrr, X86::VMINSSrm, 0 }, { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 }, + { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE }, { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 }, { X86::VMULPDrr, X86::VMULPDrm, 0 }, { X86::VMULPSrr, X86::VMULPSrm, 0 }, @@ -5531,6 +5533,23 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( } } break; + case X86::MOVHLPSrr: + case X86::VMOVHLPSrr: + // Move the upper 64-bits of the second operand to the lower 64-bits. + // To fold the load, adjust the pointer to the upper and use (V)MOVLPS. + // TODO: In most cases AVX doesn't have a 8-byte alignment requirement. + if (OpNum == 2) { + unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize(); + if (Size <= RCSize && 8 <= Align) { + unsigned NewOpCode = + (MI->getOpcode() == X86::VMOVHLPSrr ? X86::VMOVLPSrm + : X86::MOVLPSrm); + MachineInstr *NewMI = + FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8); + return NewMI; + } + } + break; }; return nullptr; @@ -5741,6 +5760,10 @@ static bool hasPartialRegUpdate(unsigned Opcode) { case X86::CVTSS2SDrm: case X86::Int_CVTSS2SDrr: case X86::Int_CVTSS2SDrm: + case X86::MOVHPDrm: + case X86::MOVHPSrm: + case X86::MOVLPDrm: + case X86::MOVLPSrm: case X86::RCPSSr: case X86::RCPSSm: case X86::RCPSSr_Int: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse1.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse1.ll index 75c0f57..fc4652e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-sse1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-sse1.ll @@ -94,7 +94,7 @@ define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) { define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) { ; SSE1-LABEL: shuffle_v4f32_0145: ; SSE1: # BB#0: -; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1] +; SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE1-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -102,8 +102,7 @@ define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) { define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) { ; SSE1-LABEL: shuffle_v4f32_6723: ; SSE1: # BB#0: -; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[2,3] -; SSE1-NEXT: movaps %xmm1, %xmm0 +; SSE1-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; SSE1-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -211,7 +210,7 @@ define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) { ; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE1-NEXT: xorps %xmm2, %xmm2 -; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE1-NEXT: movaps %xmm1, %xmm0 ; SSE1-NEXT: retq @@ -232,8 +231,8 @@ define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) { ; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE1-NEXT: xorps %xmm2, %xmm2 -; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1] +; SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE1-NEXT: retq %a = load <2 x float>, <2 x float>* %ptr %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> @@ -251,3 +250,21 @@ define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) { %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> ret <4 x float> %shuffle } +define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, <4 x float>* %pb) { +; SSE1-LABEL: shuffle_mem_v4f32_0145: +; SSE1: # BB#0: +; SSE1-NEXT: movhps (%rdi), %xmm0 +; SSE1-NEXT: retq + %b = load <4 x float>, <4 x float>* %pb, align 16 + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_mem_v4f32_6723(<4 x float> %a, <4 x float>* %pb) { +; SSE1-LABEL: shuffle_mem_v4f32_6723: +; SSE1: # BB#0: +; SSE1-NEXT: movlps 8(%rdi), %xmm0 +; SSE1-NEXT: retq + %b = load <4 x float>, <4 x float>* %pb, align 16 + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + ret <4 x float> %shuffle +} -- 2.7.4