From a533e87792caf3cf30f2368672b0d952a7b07786 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 23 Sep 2019 01:05:33 +0000
Subject: [PATCH] [X86][SelectionDAGBuilder] Move the hack for handling MMX
 shift by i32 intrinsics into the X86 backend.

These intrinsics should be shift-by-immediate, but gcc allows any i32 scalar
and clang needs to match that. So we try to detect the non-constant case and
move the data from an integer register to an MMX register.

Previously this was done by creating a v2i32 build_vector and bitcast in
SelectionDAGBuilder. This had to be done early since v2i32 isn't a legal
type. The bitcast+build_vector would be DAG combined to X86ISD::MMX_MOVW2D,
which isel turns into a GPR->MMX MOVD.

This commit just moves the whole thing to lowering and emits the
X86ISD::MMX_MOVW2D directly to avoid the illegal type. The test changes just
seem to be due to nodes being linearized in a different order.
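For reference, a minimal sketch of the kind of source this hack exists for
(a hypothetical example, not taken from the test suite; the function name is
made up). The shift count is a variable rather than a compile-time constant,
so clang emits llvm.x86.mmx.pslli.w with a non-constant i32 operand:

  #include <mmintrin.h>

  /* shift_words is a hypothetical name. gcc accepts a non-constant
     count for this "immediate" shift, and clang matches that; the
     variable `n` is what triggers the GPR->MMX MOVD path. */
  __m64 shift_words(__m64 v, int n) {
    return _mm_slli_pi16(v, n);
  }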
llvm-svn: 372535
---
 .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 59 ----------------------
 llvm/lib/Target/X86/X86ISelLowering.cpp          | 52 +++++++++++++++++++
 llvm/test/CodeGen/X86/bitcast-mmx.ll             | 38 +++++++-------
 llvm/test/CodeGen/X86/mmx-fold-load.ll           | 16 +++---
 4 files changed, 79 insertions(+), 86 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 98f7d87..7f9266b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5969,65 +5969,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::masked_compressstore:
     visitMaskedStore(I, true /* IsCompressing */);
     return;
-  case Intrinsic::x86_mmx_pslli_w:
-  case Intrinsic::x86_mmx_pslli_d:
-  case Intrinsic::x86_mmx_pslli_q:
-  case Intrinsic::x86_mmx_psrli_w:
-  case Intrinsic::x86_mmx_psrli_d:
-  case Intrinsic::x86_mmx_psrli_q:
-  case Intrinsic::x86_mmx_psrai_w:
-  case Intrinsic::x86_mmx_psrai_d: {
-    SDValue ShAmt = getValue(I.getArgOperand(1));
-    if (isa<ConstantSDNode>(ShAmt)) {
-      visitTargetIntrinsic(I, Intrinsic);
-      return;
-    }
-    unsigned NewIntrinsic = 0;
-    EVT ShAmtVT = MVT::v2i32;
-    switch (Intrinsic) {
-    case Intrinsic::x86_mmx_pslli_w:
-      NewIntrinsic = Intrinsic::x86_mmx_psll_w;
-      break;
-    case Intrinsic::x86_mmx_pslli_d:
-      NewIntrinsic = Intrinsic::x86_mmx_psll_d;
-      break;
-    case Intrinsic::x86_mmx_pslli_q:
-      NewIntrinsic = Intrinsic::x86_mmx_psll_q;
-      break;
-    case Intrinsic::x86_mmx_psrli_w:
-      NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
-      break;
-    case Intrinsic::x86_mmx_psrli_d:
-      NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
-      break;
-    case Intrinsic::x86_mmx_psrli_q:
-      NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
-      break;
-    case Intrinsic::x86_mmx_psrai_w:
-      NewIntrinsic = Intrinsic::x86_mmx_psra_w;
-      break;
-    case Intrinsic::x86_mmx_psrai_d:
-      NewIntrinsic = Intrinsic::x86_mmx_psra_d;
-      break;
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    }
-
-    // The vector shift intrinsics with scalars uses 32b shift amounts but
-    // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
-    // to be zero.
-    // We must do this early because v2i32 is not a legal type.
-    SDValue ShOps[2];
-    ShOps[0] = ShAmt;
-    ShOps[1] = DAG.getConstant(0, sdl, MVT::i32);
-    ShAmt = DAG.getBuildVector(ShAmtVT, sdl, ShOps);
-    EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
-    ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt);
-    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT,
-                      DAG.getConstant(NewIntrinsic, sdl, MVT::i32),
-                      getValue(I.getArgOperand(0)), ShAmt);
-    setValue(&I, Res);
-    return;
-  }
   case Intrinsic::powi:
     setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)),
                             getValue(I.getArgOperand(1)), DAG));
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dec3773..b666be7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23738,6 +23738,58 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                    MaskVT, Operation);
     return DAG.getMergeValues({Result0, Result1}, DL);
   }
+  case Intrinsic::x86_mmx_pslli_w:
+  case Intrinsic::x86_mmx_pslli_d:
+  case Intrinsic::x86_mmx_pslli_q:
+  case Intrinsic::x86_mmx_psrli_w:
+  case Intrinsic::x86_mmx_psrli_d:
+  case Intrinsic::x86_mmx_psrli_q:
+  case Intrinsic::x86_mmx_psrai_w:
+  case Intrinsic::x86_mmx_psrai_d: {
+    SDValue ShAmt = Op.getOperand(2);
+    // If the argument is a constant, this is fine.
+    if (isa<ConstantSDNode>(ShAmt))
+      return Op;
+
+    unsigned NewIntrinsic;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_mmx_pslli_w:
+      NewIntrinsic = Intrinsic::x86_mmx_psll_w;
+      break;
+    case Intrinsic::x86_mmx_pslli_d:
+      NewIntrinsic = Intrinsic::x86_mmx_psll_d;
+      break;
+    case Intrinsic::x86_mmx_pslli_q:
+      NewIntrinsic = Intrinsic::x86_mmx_psll_q;
+      break;
+    case Intrinsic::x86_mmx_psrli_w:
+      NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
+      break;
+    case Intrinsic::x86_mmx_psrli_d:
+      NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
+      break;
+    case Intrinsic::x86_mmx_psrli_q:
+      NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
+      break;
+    case Intrinsic::x86_mmx_psrai_w:
+      NewIntrinsic = Intrinsic::x86_mmx_psra_w;
+      break;
+    case Intrinsic::x86_mmx_psrai_d:
+      NewIntrinsic = Intrinsic::x86_mmx_psra_d;
+      break;
+    }
+
+    // The vector shift intrinsics with scalars use 32-bit shift amounts, but
+    // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
+    // MMX register.
+    SDLoc DL(Op);
+    ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
+                       DAG.getConstant(NewIntrinsic, DL, MVT::i32),
+                       Op.getOperand(1), ShAmt);
+
+  }
   }
 }
diff --git a/llvm/test/CodeGen/X86/bitcast-mmx.ll b/llvm/test/CodeGen/X86/bitcast-mmx.ll
index 343c230..566cb1b 100644
--- a/llvm/test/CodeGen/X86/bitcast-mmx.ll
+++ b/llvm/test/CodeGen/X86/bitcast-mmx.ll
@@ -34,10 +34,10 @@ define i64 @t1(i64 %x, i32 %n) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 16(%ebp), %mm0
-; X86-NEXT:    movq 8(%ebp), %mm1
-; X86-NEXT:    psllq %mm0, %mm1
-; X86-NEXT:    movq %mm1, (%esp)
+; X86-NEXT:    movq 8(%ebp), %mm0
+; X86-NEXT:    movd 16(%ebp), %mm1
+; X86-NEXT:    psllq %mm1, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ebp, %esp
@@ -46,10 +46,10 @@ define i64 @t1(i64 %x, i32 %n) nounwind {
 ;
 ; X64-LABEL: t1:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movd %esi, %mm0
-; X64-NEXT:    movq %rdi, %mm1
-; X64-NEXT:    psllq %mm0, %mm1
-; X64-NEXT:    movq %mm1, %rax
+; X64-NEXT:    movq %rdi, %mm0
+; X64-NEXT:    movd %esi, %mm1
+; X64-NEXT:    psllq %mm1, %mm0
+; X64-NEXT:    movq %mm0, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast i64 %x to x86_mmx
@@ -65,11 +65,11 @@ define i64 @t2(i64 %x, i32 %n, i32 %w) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 16(%ebp), %mm0
-; X86-NEXT:    movd 20(%ebp), %mm1
-; X86-NEXT:    psllq %mm0, %mm1
-; X86-NEXT:    por 8(%ebp), %mm1
-; X86-NEXT:    movq %mm1, (%esp)
+; X86-NEXT:    movd 20(%ebp), %mm0
+; X86-NEXT:    movd 16(%ebp), %mm1
+; X86-NEXT:    psllq %mm1, %mm0
+; X86-NEXT:    por 8(%ebp), %mm0
+; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ebp, %esp
@@ -78,12 +78,12 @@ define i64 @t2(i64 %x, i32 %n, i32 %w) nounwind {
 ;
 ; X64-LABEL: t2:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movd %esi, %mm0
-; X64-NEXT:    movd %edx, %mm1
-; X64-NEXT:    psllq %mm0, %mm1
-; X64-NEXT:    movq %rdi, %mm0
-; X64-NEXT:    por %mm1, %mm0
-; X64-NEXT:    movq %mm0, %rax
+; X64-NEXT:    movd %edx, %mm0
+; X64-NEXT:    movd %esi, %mm1
+; X64-NEXT:    psllq %mm1, %mm0
+; X64-NEXT:    movq %rdi, %mm1
+; X64-NEXT:    por %mm0, %mm1
+; X64-NEXT:    movq %mm1, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = insertelement <2 x i32> undef, i32 %w, i32 0
diff --git a/llvm/test/CodeGen/X86/mmx-fold-load.ll b/llvm/test/CodeGen/X86/mmx-fold-load.ll
index 71b8b40..5ad2d50 100644
--- a/llvm/test/CodeGen/X86/mmx-fold-load.ll
+++ b/llvm/test/CodeGen/X86/mmx-fold-load.ll
@@ -585,22 +585,22 @@ define void @test_psrlq_by_volatile_shift_amount(x86_mmx* %t) nounwind {
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $1, (%esp)
-; X86-NEXT:    movd (%esp), %mm0
 ; X86-NEXT:    movl $255, %ecx
-; X86-NEXT:    movd %ecx, %mm1
-; X86-NEXT:    psrlq %mm0, %mm1
-; X86-NEXT:    movq %mm1, (%eax)
+; X86-NEXT:    movd %ecx, %mm0
+; X86-NEXT:    movd (%esp), %mm1
+; X86-NEXT:    psrlq %mm1, %mm0
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_psrlq_by_volatile_shift_amount:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl $1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movd -{{[0-9]+}}(%rsp), %mm0
 ; X64-NEXT:    movl $255, %eax
-; X64-NEXT:    movd %eax, %mm1
-; X64-NEXT:    psrlq %mm0, %mm1
-; X64-NEXT:    movq %mm1, (%rdi)
+; X64-NEXT:    movd %eax, %mm0
+; X64-NEXT:    movd -{{[0-9]+}}(%rsp), %mm1
+; X64-NEXT:    psrlq %mm1, %mm0
+; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    retq
 entry:
   %0 = alloca i32, align 4
--
2.7.4