From 1ffd8e8114bc34295380b7fed815c7cf7644e84e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 22 Mar 2019 20:46:56 +0000 Subject: [PATCH] [X86] Use movq for i64 atomic load on 32-bit targets when sse2 is enable We used a lock cmpxchg8b to do i64 atomic loads. But if we have SSE2 we can do better and use a plain movq to do the load instead. I tried to just use an f64 atomic load and add isel patterns to MOVSD(which the domain fixing pass can turn to MOVQ), but the atomic_load SDNode in TargetSelectionDAG.td requires the type to be integer. So I've emitted VZEXT_LOAD instead which should be selected by isel to a MOVQ. Hopefully we don't need a specific atomic flavor of this. I kept the memory operand from the original AtomicSDNode. I wasn't sure if I might need to set the MOVolatile flag? I've left some FIXMEs for improvements we can do without SSE2. Differential Revision: https://reviews.llvm.org/D59679 llvm-svn: 356807 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 49 +++++++++++++++++++++--- llvm/test/CodeGen/X86/atomic-load-store-wide.ll | 38 ++++++++++++++++++- llvm/test/CodeGen/X86/atomic-non-integer.ll | 50 ++++--------------------- 3 files changed, 88 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5de8ea9..dc3292e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -485,6 +485,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } + if (!Subtarget.is64Bit()) + setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); + if (Subtarget.hasCmpxchg16b()) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } @@ -25494,11 +25497,22 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { } // Note: this turns large loads into lock cmpxchg8b/16b. -// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. +// TODO: In 32-bit mode, use MOVLPS when SSE1 is available? +// TODO: In 32-bit mode, use FILD/FISTP when X87 is available? TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { - return needsCmpXchgNb(LI->getType()) ? AtomicExpansionKind::CmpXChg - : AtomicExpansionKind::None; + Type *MemType = LI->getType(); + + // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we + // can use movq to do the load. + bool NoImplicitFloatOps = + LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat); + if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && + !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2()) + return AtomicExpansionKind::None; + + return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; } TargetLowering::AtomicExpansionKind @@ -27312,6 +27326,32 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(EFLAGS.getValue(1)); return; } + case ISD::ATOMIC_LOAD: { + assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); + bool NoImplicitFloatOps = + DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps && + Subtarget.hasSSE2()) { + auto *Node = cast(N); + // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the lower + // 64-bits. + SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; + SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, + MVT::i64, Node->getMemOperand()); + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + Results.push_back(Ld.getValue(1)); + return; + } + // TODO: Use MOVLPS when SSE1 is available? + // TODO: Use FILD/FISTP when X87 is available? + // Delegate to generic TypeLegalization. Situations we can really handle + // should have already been dealt with by AtomicExpandPass.cpp. + break; + } case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: @@ -27323,11 +27363,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: - case ISD::ATOMIC_LOAD: { // Delegate to generic TypeLegalization. Situations we can really handle // should have already been dealt with by AtomicExpandPass.cpp. break; - } + case ISD::BITCAST: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT DstVT = N->getValueType(0); diff --git a/llvm/test/CodeGen/X86/atomic-load-store-wide.ll b/llvm/test/CodeGen/X86/atomic-load-store-wide.ll index 25fdbe0..2841b8c 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store-wide.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store-wide.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mcpu=corei7 -mtriple=i686-- -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mcpu=corei7 -mtriple=i686-- -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42 +; RUN: llc < %s -mtriple=i686-- -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=NOSSE ; 64-bit load/store on x86-32 ; FIXME: The generated code can be substantially improved. @@ -34,7 +35,40 @@ define void @test1(i64* %ptr, i64 %val1) { } define i64 @test2(i64* %ptr) { -; CHECK-LABEL: test2: +; SSE42-LABEL: test2: +; SSE42: # %bb.0: +; SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE42-NEXT: movd %xmm0, %eax +; SSE42-NEXT: pextrd $1, %xmm0, %edx +; SSE42-NEXT: retl +; +; NOSSE-LABEL: test2: +; NOSSE: # %bb.0: +; NOSSE-NEXT: pushl %ebx +; NOSSE-NEXT: .cfi_def_cfa_offset 8 +; NOSSE-NEXT: pushl %esi +; NOSSE-NEXT: .cfi_def_cfa_offset 12 +; NOSSE-NEXT: .cfi_offset %esi, -12 +; NOSSE-NEXT: .cfi_offset %ebx, -8 +; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; NOSSE-NEXT: xorl %eax, %eax +; NOSSE-NEXT: xorl %edx, %edx +; NOSSE-NEXT: xorl %ecx, %ecx +; NOSSE-NEXT: xorl %ebx, %ebx +; NOSSE-NEXT: lock cmpxchg8b (%esi) +; NOSSE-NEXT: popl %esi +; NOSSE-NEXT: .cfi_def_cfa_offset 8 +; NOSSE-NEXT: popl %ebx +; NOSSE-NEXT: .cfi_def_cfa_offset 4 +; NOSSE-NEXT: retl + %val = load atomic i64, i64* %ptr seq_cst, align 8 + ret i64 %val +} + +; Same as test2, but with noimplicitfloat. +define i64 @test3(i64* %ptr) noimplicitfloat { +; CHECK-LABEL: test3: ; CHECK: # %bb.0: ; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: .cfi_def_cfa_offset 8 diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll index 40f159b..529a288 100644 --- a/llvm/test/CodeGen/X86/atomic-non-integer.ll +++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -212,30 +212,13 @@ define float @load_float(float* %fptr) { define double @load_double(double* %fptr) { ; X86-SSE-LABEL: load_double: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebx -; X86-SSE-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: .cfi_def_cfa_offset 12 ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 24 -; X86-SSE-NEXT: .cfi_offset %esi, -12 -; X86-SSE-NEXT: .cfi_offset %ebx, -8 -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: xorl %eax, %eax -; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: xorl %ecx, %ecx -; X86-SSE-NEXT: xorl %ebx, %ebx -; X86-SSE-NEXT: lock cmpxchg8b (%esi) -; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: movd %eax, %xmm1 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE-NEXT: movq %xmm1, (%esp) +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movlps %xmm0, (%esp) ; X86-SSE-NEXT: fldl (%esp) ; X86-SSE-NEXT: addl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 12 -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE-NEXT: popl %ebx ; X86-SSE-NEXT: .cfi_def_cfa_offset 4 ; X86-SSE-NEXT: retl ; @@ -440,30 +423,13 @@ define float @load_float_seq_cst(float* %fptr) { define double @load_double_seq_cst(double* %fptr) { ; X86-SSE-LABEL: load_double_seq_cst: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebx -; X86-SSE-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: .cfi_def_cfa_offset 12 ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 24 -; X86-SSE-NEXT: .cfi_offset %esi, -12 -; X86-SSE-NEXT: .cfi_offset %ebx, -8 -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: xorl %eax, %eax -; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: xorl %ecx, %ecx -; X86-SSE-NEXT: xorl %ebx, %ebx -; X86-SSE-NEXT: lock cmpxchg8b (%esi) -; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: movd %eax, %xmm1 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE-NEXT: movq %xmm1, (%esp) +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movlps %xmm0, (%esp) ; X86-SSE-NEXT: fldl (%esp) ; X86-SSE-NEXT: addl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 12 -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE-NEXT: popl %ebx ; X86-SSE-NEXT: .cfi_def_cfa_offset 4 ; X86-SSE-NEXT: retl ; -- 2.7.4