From 3bef27cc6f42c01f56cf0b6876b7cf0d3c10ba6d Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Tue, 16 May 2017 20:18:06 +0000 Subject: [PATCH] [PPC] Lower load acquire/seq_cst trailing fence to cmp + bne + isync. Summary: This fixes pr32392. The lowering pipeline is: llvm.ppc.cfence in IR -> PPC::CFENCE8 in isel -> Actual instructions in expandPostRAPseudo. The reason why expandPostRAPseudo is chosen is because previous passes are likely eliminating instructions like cmpw 3, 3 (early CSE) and bne- 7, .+4 (some branch pass(s)). Differential Revision: https://reviews.llvm.org/D32763 llvm-svn: 303205 --- llvm/include/llvm/IR/IntrinsicsPowerPC.td | 2 + llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 45 +++++++++++++++-- llvm/lib/Target/PowerPC/PPCISelLowering.h | 1 + llvm/lib/Target/PowerPC/PPCInstr64Bit.td | 4 ++ llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 13 +++++ llvm/lib/Target/PowerPC/PPCInstrInfo.td | 12 +++-- llvm/test/CodeGen/PowerPC/atomic-2.ll | 6 ++- llvm/test/CodeGen/PowerPC/atomics-indexed.ll | 14 ++++-- llvm/test/CodeGen/PowerPC/atomics-regression.ll | 64 +++++++++++++++++++++---- llvm/test/CodeGen/PowerPC/atomics.ll | 14 ++++-- 10 files changed, 149 insertions(+), 26 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td index 64240a9..6321bb8 100644 --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -1132,4 +1132,6 @@ def int_ppc_tsuspend : GCCBuiltin<"__builtin_tsuspend">, def int_ppc_ttest : GCCBuiltin<"__builtin_ttest">, Intrinsic<[llvm_i64_ty], [], []>; + +def int_ppc_cfence : Intrinsic<[], [llvm_anyint_ty], []>; } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 17bdd59..144aea8 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -410,6 +410,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // To handle 
counter-based loop conditions. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + // Comparisons that require checking two conditions. setCondCodeAction(ISD::SETULT, MVT::f32, Expand); setCondCodeAction(ISD::SETULT, MVT::f64, Expand); @@ -8184,6 +8189,26 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return Flags; } +SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to + // the beginning of the argument list. + int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1; + SDLoc DL(Op); + switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) { + case Intrinsic::ppc_cfence: { + assert(Subtarget.isPPC64() && "Only 64-bit is supported for now."); + return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other, + DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, + Op.getOperand(ArgStart + 1))), + 0); + } + default: + break; + } + return SDValue(); + } + SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -8649,6 +8674,9 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { // Frame & Return address.
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + + case ISD::INTRINSIC_VOID: + return LowerINTRINSIC_VOID(Op, DAG); } } @@ -8753,12 +8781,19 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { - if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) + if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) { + // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and + // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html + // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. + if (isa<LoadInst>(Inst) && Subtarget.isPPC64()) + return Builder.CreateCall( + Intrinsic::getDeclaration( + Builder.GetInsertBlock()->getParent()->getParent(), + Intrinsic::ppc_cfence, {Inst->getType()}), + {Inst}); + // FIXME: Can use isync for rmw operation. return callIntrinsic(Builder, Intrinsic::ppc_lwsync); - // FIXME: this is too conservative, a dependent branch + isync is enough. - // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and - // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html - // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
+ } return nullptr; } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 4fc7442..acb7794 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -905,6 +905,7 @@ namespace llvm { SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index a843391..a3f894c 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -983,6 +983,10 @@ def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg), [(set i64:$rD, (PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>, isPPC64; + +let isBarrier = 1, isPseudo = 1, Defs = [CR7], Itinerary = IIC_LdStSync in +def CFENCE8 : Pseudo<(outs), (ins g8rc:$cr), "#CFENCE8", []>; + def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g), (ADD8TLS $in, tglobaltlsaddr:$g)>; def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 790a890..3afcec1 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1873,6 +1873,8 @@ PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { } bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + auto &MBB = *MI.getParent(); + auto DL = MI.getDebugLoc(); switch (MI.getOpcode()) { case TargetOpcode::LOAD_STACK_GUARD: { assert(Subtarget.isTargetLinux() 
&& @@ -1920,6 +1922,17 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(Opcode)); return true; } + case PPC::CFENCE8: { + auto Val = MI.getOperand(0).getReg(); + BuildMI(MBB, MI, DL, get(PPC::CMPW), PPC::CR7).addReg(Val).addReg(Val); + BuildMI(MBB, MI, DL, get(PPC::CTRL_DEP)) + .addImm(PPC::PRED_NE_MINUS) + .addReg(PPC::CR7) + .addImm(1); + MI.setDesc(get(PPC::ISYNC)); + MI.RemoveOperand(0); + return true; + } } return false; } diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 1af5e7f..0766cfe 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -1223,9 +1223,15 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { // FIXME: should be able to write a pattern for PPCcondbranch, but can't use // a two-value operand where a dag node expects two operands. :( let isCodeGenOnly = 1 in { - def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst), - "b${cond:cc}${cond:pm} ${cond:reg}, $dst" - /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>; + class BCC_class : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst), + "b${cond:cc}${cond:pm} ${cond:reg}, $dst" + /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>; + def BCC : BCC_class; + + // The same as BCC, except that it's not a terminator. Used for introducing + // control flow dependency without creating new blocks. 
+ let isTerminator = 0 in def CTRL_DEP : BCC_class; + def BCCA : BForm<16, 1, 0, (outs), (ins pred:$cond, abscondbrtarget:$dst), "b${cond:cc}a${cond:pm} ${cond:reg}, $dst">; diff --git a/llvm/test/CodeGen/PowerPC/atomic-2.ll b/llvm/test/CodeGen/PowerPC/atomic-2.ll index 18715dd..f61ca3d 100644 --- a/llvm/test/CodeGen/PowerPC/atomic-2.ll +++ b/llvm/test/CodeGen/PowerPC/atomic-2.ll @@ -108,8 +108,10 @@ entry: ; CHECK: @atomic_load %tmp = load atomic i64, i64* %mem acquire, align 64 ; CHECK-NOT: ldarx -; CHECK: ld -; CHECK: lwsync +; CHECK: ld [[VAL:[0-9]+]] +; CHECK: cmpw [[CR:[0-9]+]], [[VAL]], [[VAL]] +; CHECK: bne- [[CR]], .+4 +; CHECK: isync ret i64 %tmp } diff --git a/llvm/test/CodeGen/PowerPC/atomics-indexed.ll b/llvm/test/CodeGen/PowerPC/atomics-indexed.ll index 7a0dde0..cfe15f0 100644 --- a/llvm/test/CodeGen/PowerPC/atomics-indexed.ll +++ b/llvm/test/CodeGen/PowerPC/atomics-indexed.ll @@ -10,16 +10,22 @@ define i8 @load_x_i8_seq_cst([100000 x i8]* %mem) { ; CHECK-LABEL: load_x_i8_seq_cst ; CHECK: sync -; CHECK: lbzx -; CHECK: lwsync +; CHECK: lbzx [[VAL:r[0-9]+]] +; CHECK-PPC32: lwsync +; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]] +; CHECK-PPC64: bne- [[CR]], .+4 +; CHECK-PPC64: isync %ptr = getelementptr inbounds [100000 x i8], [100000 x i8]* %mem, i64 0, i64 90000 %val = load atomic i8, i8* %ptr seq_cst, align 1 ret i8 %val } define i16 @load_x_i16_acquire([100000 x i16]* %mem) { ; CHECK-LABEL: load_x_i16_acquire -; CHECK: lhzx -; CHECK: lwsync +; CHECK: lhzx [[VAL:r[0-9]+]] +; CHECK-PPC32: lwsync +; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]] +; CHECK-PPC64: bne- [[CR]], .+4 +; CHECK-PPC64: isync %ptr = getelementptr inbounds [100000 x i16], [100000 x i16]* %mem, i64 0, i64 90000 %val = load atomic i16, i16* %ptr acquire, align 2 ret i16 %val diff --git a/llvm/test/CodeGen/PowerPC/atomics-regression.ll b/llvm/test/CodeGen/PowerPC/atomics-regression.ll index 9af82b6..054d3a4 100644 --- a/llvm/test/CodeGen/PowerPC/atomics-regression.ll +++ 
b/llvm/test/CodeGen/PowerPC/atomics-regression.ll @@ -23,7 +23,9 @@ define i8 @test2(i8* %ptr) { ; PPC64LE-LABEL: test2: ; PPC64LE: # BB#0: ; PPC64LE-NEXT: lbz 3, 0(3) -; PPC64LE-NEXT: lwsync +; PPC64LE-NEXT: cmpw 7, 3, 3 +; PPC64LE-NEXT: bne- 7, .+4 +; PPC64LE-NEXT: isync ; PPC64LE-NEXT: blr %val = load atomic i8, i8* %ptr acquire, align 1 ret i8 %val @@ -35,7 +37,9 @@ define i8 @test3(i8* %ptr) { ; PPC64LE-NEXT: sync ; PPC64LE-NEXT: ori 2, 2, 0 ; PPC64LE-NEXT: lbz 3, 0(3) -; PPC64LE-NEXT: lwsync +; PPC64LE-NEXT: cmpw 7, 3, 3 +; PPC64LE-NEXT: bne- 7, .+4 +; PPC64LE-NEXT: isync ; PPC64LE-NEXT: blr %val = load atomic i8, i8* %ptr seq_cst, align 1 ret i8 %val @@ -63,7 +67,9 @@ define i16 @test6(i16* %ptr) { ; PPC64LE-LABEL: test6: ; PPC64LE: # BB#0: ; PPC64LE-NEXT: lhz 3, 0(3) -; PPC64LE-NEXT: lwsync +; PPC64LE-NEXT: cmpw 7, 3, 3 +; PPC64LE-NEXT: bne- 7, .+4 +; PPC64LE-NEXT: isync ; PPC64LE-NEXT: blr %val = load atomic i16, i16* %ptr acquire, align 2 ret i16 %val @@ -75,7 +81,9 @@ define i16 @test7(i16* %ptr) { ; PPC64LE-NEXT: sync ; PPC64LE-NEXT: ori 2, 2, 0 ; PPC64LE-NEXT: lhz 3, 0(3) -; PPC64LE-NEXT: lwsync +; PPC64LE-NEXT: cmpw 7, 3, 3 +; PPC64LE-NEXT: bne- 7, .+4 +; PPC64LE-NEXT: isync ; PPC64LE-NEXT: blr %val = load atomic i16, i16* %ptr seq_cst, align 2 ret i16 %val @@ -103,7 +111,9 @@ define i32 @test10(i32* %ptr) { ; PPC64LE-LABEL: test10: ; PPC64LE: # BB#0: ; PPC64LE-NEXT: lwz 3, 0(3) -; PPC64LE-NEXT: lwsync +; PPC64LE-NEXT: cmpw 7, 3, 3 +; PPC64LE-NEXT: bne- 7, .+4 +; PPC64LE-NEXT: isync ; PPC64LE-NEXT: blr %val = load atomic i32, i32* %ptr acquire, align 4 ret i32 %val @@ -115,7 +125,9 @@ define i32 @test11(i32* %ptr) { ; PPC64LE-NEXT: sync ; PPC64LE-NEXT: ori 2, 2, 0 ; PPC64LE-NEXT: lwz 3, 0(3) -; PPC64LE-NEXT: lwsync +; PPC64LE-NEXT: cmpw 7, 3, 3 +; PPC64LE-NEXT: bne- 7, .+4 +; PPC64LE-NEXT: isync ; PPC64LE-NEXT: blr %val = load atomic i32, i32* %ptr seq_cst, align 4 ret i32 %val @@ -143,7 +155,9 @@ define i64 @test14(i64* %ptr) { ; PPC64LE-LABEL: 
test14: ; PPC64LE: # BB#0: ; PPC64LE-NEXT: ld 3, 0(3) -; PPC64LE-NEXT: lwsync +; PPC64LE-NEXT: cmpw 7, 3, 3 +; PPC64LE-NEXT: bne- 7, .+4 +; PPC64LE-NEXT: isync ; PPC64LE-NEXT: blr %val = load atomic i64, i64* %ptr acquire, align 8 ret i64 %val @@ -155,7 +169,9 @@ define i64 @test15(i64* %ptr) { ; PPC64LE-NEXT: sync ; PPC64LE-NEXT: ori 2, 2, 0 ; PPC64LE-NEXT: ld 3, 0(3) -; PPC64LE-NEXT: lwsync +; PPC64LE-NEXT: cmpw 7, 3, 3 +; PPC64LE-NEXT: bne- 7, .+4 +; PPC64LE-NEXT: isync ; PPC64LE-NEXT: blr %val = load atomic i64, i64* %ptr seq_cst, align 8 ret i64 %val @@ -9544,3 +9560,35 @@ define i64 @test559(i64* %ptr, i64 %val) { %ret = atomicrmw umin i64* %ptr, i64 %val singlethread seq_cst ret i64 %ret } + +; The second load should never be scheduled before isync. +define i32 @test_ordering0(i32* %ptr1, i32* %ptr2) { +; PPC64LE-LABEL: test_ordering0: +; PPC64LE: # BB#0: +; PPC64LE-NEXT: lwz 4, 0(3) +; PPC64LE-NEXT: cmpw 7, 4, 4 +; PPC64LE-NEXT: bne- 7, .+4 +; PPC64LE-NEXT: isync +; PPC64LE-NEXT: lwz 3, 0(3) +; PPC64LE-NEXT: add 3, 4, 3 +; PPC64LE-NEXT: blr + %val1 = load atomic i32, i32* %ptr1 acquire, align 4 + %val2 = load i32, i32* %ptr1 + %add = add i32 %val1, %val2 + ret i32 %add +} + +; The second store should never be scheduled before isync. 
+define i32 @test_ordering1(i32* %ptr1, i32 %val1, i32* %ptr2) { +; PPC64LE-LABEL: test_ordering1: +; PPC64LE: # BB#0: +; PPC64LE-NEXT: lwz 3, 0(3) +; PPC64LE-NEXT: cmpw 7, 3, 3 +; PPC64LE-NEXT: bne- 7, .+4 +; PPC64LE-NEXT: isync +; PPC64LE-NEXT: stw 4, 0(5) +; PPC64LE-NEXT: blr + %val2 = load atomic i32, i32* %ptr1 acquire, align 4 + store i32 %val1, i32* %ptr2 + ret i32 %val2 +} diff --git a/llvm/test/CodeGen/PowerPC/atomics.ll b/llvm/test/CodeGen/PowerPC/atomics.ll index 2e1eff0..61d5453 100644 --- a/llvm/test/CodeGen/PowerPC/atomics.ll +++ b/llvm/test/CodeGen/PowerPC/atomics.ll @@ -25,9 +25,12 @@ define i16 @load_i16_monotonic(i16* %mem) { } define i32 @load_i32_acquire(i32* %mem) { ; CHECK-LABEL: load_i32_acquire -; CHECK: lwz +; CHECK: lwz [[VAL:r[0-9]+]] %val = load atomic i32, i32* %mem acquire, align 4 -; CHECK: lwsync +; CHECK-PPC32: lwsync +; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]] +; CHECK-PPC64: bne- [[CR]], .+4 +; CHECK-PPC64: isync ret i32 %val } define i64 @load_i64_seq_cst(i64* %mem) { @@ -35,9 +38,12 @@ define i64 @load_i64_seq_cst(i64* %mem) { ; CHECK: sync ; PPC32: __sync_ ; PPC64-NOT: __sync_ -; PPC64: ld +; PPC64: ld [[VAL:r[0-9]+]] %val = load atomic i64, i64* %mem seq_cst, align 8 -; CHECK: lwsync +; CHECK-PPC32: lwsync +; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]] +; CHECK-PPC64: bne- [[CR]], .+4 +; CHECK-PPC64: isync ret i64 %val } -- 2.7.4