From 59f4b3d3081535b61609f12ea5f638905616fcbc Mon Sep 17 00:00:00 2001 From: Qiu Chaofan Date: Tue, 23 Nov 2021 17:21:17 +0800 Subject: [PATCH] [PowerPC] Implement more fusion types for Power10 This implements the rest of Power10 instruction fusion pairs, according to user manual, including 'wide immediate', 'load compare', 'zero move' and 'SHA3 assist'. Only 'SHA3 assist' is enabled by default. Reviewed By: shchenz Differential Revision: https://reviews.llvm.org/D112912 --- llvm/lib/Target/PowerPC/PPC.td | 18 ++++++- llvm/lib/Target/PowerPC/PPCMacroFusion.cpp | 73 +++++++++++++++++++++++++++++ llvm/lib/Target/PowerPC/PPCMacroFusion.def | 75 ++++++++++++++++++++++++++++++ llvm/lib/Target/PowerPC/PPCSubtarget.cpp | 4 ++ llvm/lib/Target/PowerPC/PPCSubtarget.h | 8 ++++ llvm/test/CodeGen/PowerPC/macro-fusion.mir | 53 +++++++++++++++++++++ 6 files changed, 230 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index a1ff20b..422bd11 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -203,6 +203,22 @@ def FeatureLogicalFusion : SubtargetFeature<"fuse-logical", "HasLogicalFusion", "true", "Target supports Logical Operations fusion", [FeatureFusion]>; +def FeatureSha3Fusion : + SubtargetFeature<"fuse-sha3", "HasSha3Fusion", "true", + "Target supports SHA3 assist fusion", + [FeatureFusion]>; +def FeatureCompareFusion: + SubtargetFeature<"fuse-cmp", "HasCompareFusion", "true", + "Target supports Comparison Operations fusion", + [FeatureFusion]>; +def FeatureWideImmFusion: + SubtargetFeature<"fuse-wideimm", "HasWideImmFusion", "true", + "Target supports Wide-Immediate fusion", + [FeatureFusion]>; +def FeatureZeroMoveFusion: + SubtargetFeature<"fuse-zeromove", "HasZeroMoveFusion", "true", + "Target supports move to SPR with branch fusion", + [FeatureFusion]>; def FeatureUnalignedFloats : SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess", "true", "CPU does not trap on 
unaligned FP access">; @@ -393,7 +409,7 @@ def ProcessorFeatures { // still exist with the exception of those we know are Power9 specific. list FusionFeatures = [ FeatureStoreFusion, FeatureAddLogicalFusion, FeatureLogicalAddFusion, - FeatureLogicalFusion, FeatureArithAddFusion + FeatureLogicalFusion, FeatureArithAddFusion, FeatureSha3Fusion, ]; list P10AdditionalFeatures = !listconcat(FusionFeatures, [ diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp index bdff510..9d5206f 100644 --- a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp +++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp @@ -149,6 +149,79 @@ static bool checkOpConstraints(FusionFeature::FusionKind Kd, case FusionFeature::FK_SldiAdd: return (matchingImmOps(FirstMI, 2, 3) && matchingImmOps(FirstMI, 3, 60)) || (matchingImmOps(FirstMI, 2, 6) && matchingImmOps(FirstMI, 3, 57)); + + // rldicl rx, ra, 1, 0 - xor + case FusionFeature::FK_RotateLeftXor: + return matchingImmOps(FirstMI, 2, 1) && matchingImmOps(FirstMI, 3, 0); + + // rldicr rx, ra, 1, 63 - xor + case FusionFeature::FK_RotateRightXor: + return matchingImmOps(FirstMI, 2, 1) && matchingImmOps(FirstMI, 3, 63); + + // We actually use CMPW* and CMPD*, 'l' doesn't exist as an operand in instr. 
+ + // { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpi 0,1,rx,{ 0,1,-1 } + // { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpli 0,L,rx,{ 0,1 } + case FusionFeature::FK_LoadCmp1: + // { ld,ldx } - cmpi 0,1,rx,{ 0,1,-1 } + // { ld,ldx } - cmpli 0,1,rx,{ 0,1 } + case FusionFeature::FK_LoadCmp2: { + const MachineOperand &BT = SecondMI.getOperand(0); + if (!BT.isReg() || + (!Register::isVirtualRegister(BT.getReg()) && BT.getReg() != PPC::CR0)) + return false; + if (SecondMI.getOpcode() == PPC::CMPDI && + matchingImmOps(SecondMI, 2, -1, 16)) + return true; + return matchingImmOps(SecondMI, 2, 0) || matchingImmOps(SecondMI, 2, 1); + } + + // { lha,lhax,lwa,lwax } - cmpi 0,L,rx,{ 0,1,-1 } + case FusionFeature::FK_LoadCmp3: { + const MachineOperand &BT = SecondMI.getOperand(0); + if (!BT.isReg() || + (!Register::isVirtualRegister(BT.getReg()) && BT.getReg() != PPC::CR0)) + return false; + return matchingImmOps(SecondMI, 2, 0) || matchingImmOps(SecondMI, 2, 1) || + matchingImmOps(SecondMI, 2, -1, 16); + } + + // mtctr - { bcctr,bcctrl } + case FusionFeature::FK_ZeroMoveCTR: + // ( mtctr rx ) is alias of ( mtspr 9, rx ) + return (FirstMI.getOpcode() != PPC::MTSPR && + FirstMI.getOpcode() != PPC::MTSPR8) || + matchingImmOps(FirstMI, 0, 9); + + // mtlr - { bclr,bclrl } + case FusionFeature::FK_ZeroMoveLR: + // ( mtlr rx ) is alias of ( mtspr 8, rx ) + return (FirstMI.getOpcode() != PPC::MTSPR && + FirstMI.getOpcode() != PPC::MTSPR8) || + matchingImmOps(FirstMI, 0, 8); + + // addis rx,ra,si - addi rt,rx,SI, SI >= 0 + case FusionFeature::FK_AddisAddi: { + const MachineOperand &RA = FirstMI.getOperand(1); + const MachineOperand &SI = SecondMI.getOperand(2); + if (!SI.isImm() || !RA.isReg()) + return false; + if (RA.getReg() == PPC::ZERO || RA.getReg() == PPC::ZERO8) + return false; + return SignExtend64(SI.getImm(), 16) >= 0; + } + + // addi rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2 + case FusionFeature::FK_AddiAddis: { + const MachineOperand &RA = FirstMI.getOperand(1); + const MachineOperand &SI = 
FirstMI.getOperand(2); + if (!SI.isImm() || !RA.isReg()) + return false; + if (RA.getReg() == PPC::ZERO || RA.getReg() == PPC::ZERO8) + return false; + int64_t ExtendedSI = SignExtend64(SI.getImm(), 16); + return ExtendedSI >= 2; + } } llvm_unreachable("All the cases should have been handled"); diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.def b/llvm/lib/Target/PowerPC/PPCMacroFusion.def index 469a248..e4954b7 100644 --- a/llvm/lib/Target/PowerPC/PPCMacroFusion.def +++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.def @@ -78,5 +78,80 @@ FUSION_FEATURE(VecLogical, hasLogicalFusion, -1, FUSION_FEATURE(SldiAdd, hasArithAddFusion, -1, FUSION_OP_SET(RLDICR, RLDICR_32), FUSION_OP_SET(ADD4, ADD8, SUBF, SUBF8)) +// rldicl rx, ra, 1, 0 - xor +FUSION_FEATURE(RotateLeftXor, hasSha3Fusion, 1, + FUSION_OP_SET(RLDICL, RLDICL_32, RLDICL_32_64), + FUSION_OP_SET(XOR, XOR8)) + +// rldicr rx, ra, 1, 63 - xor +FUSION_FEATURE(RotateRightXor, hasSha3Fusion, 1, + FUSION_OP_SET(RLDICR, RLDICR_32), FUSION_OP_SET(XOR, XOR8)) + +// There are two special cases in the 'load-compare' series, so we have to +// split them into several pattern groups to fit the current framework. This +// can be made clearer once we switch to a more expressive approach. 
+ +// { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpi 0,1,rx,{ 0,1,-1 } +// { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpli 0,L,rx,{ 0,1 } +FUSION_FEATURE(LoadCmp1, hasCompareFusion, 1, + FUSION_OP_SET(LBZ, LBZ8, LBZX, LBZX8, LBZXTLS, LBZXTLS_, + LBZXTLS_32, LHZ, LHZ8, LHZX, LHZX8, LHZXTLS, + LHZXTLS_, LHZXTLS_32, LWZ, LWZ8, LWZX, LWZX8, + LWZXTLS, LWZXTLS_, LWZXTLS_32), + FUSION_OP_SET(CMPDI, CMPLDI, CMPLWI)) + +// { ld,ldx } - cmpi 0,1,rx,{ 0,1,-1 } +// { ld,ldx } - cmpli 0,1,rx,{ 0,1 } +FUSION_FEATURE(LoadCmp2, hasCompareFusion, 1, + FUSION_OP_SET(LD, LDX, LDXTLS, LDXTLS_), + FUSION_OP_SET(CMPDI, CMPLDI)) + +// { lha,lhax,lwa,lwax } - cmpi 0,L,rx,{ 0,1,-1 } +FUSION_FEATURE(LoadCmp3, hasCompareFusion, 1, + FUSION_OP_SET(LHA, LHA8, LHAX, LHAX8, LWA, LWA_32, LWAX, + LWAX_32), + FUSION_OP_SET(CMPLDI, CMPLWI)) + +// ori - oris +FUSION_FEATURE(OriOris, hasWideImmFusion, 1, FUSION_OP_SET(ORI, ORI8), + FUSION_OP_SET(ORIS, ORIS8)) + +// lis - ori +FUSION_FEATURE(LisOri, hasWideImmFusion, 1, FUSION_OP_SET(LIS, LIS8), + FUSION_OP_SET(ORI, ORI8)) + +// oris - ori +FUSION_FEATURE(OrisOri, hasWideImmFusion, 1, FUSION_OP_SET(ORIS, ORIS8), + FUSION_OP_SET(ORI, ORI8)) + +// xori - xoris +FUSION_FEATURE(XoriXoris, hasWideImmFusion, 1, FUSION_OP_SET(XORI, XORI8), + FUSION_OP_SET(XORIS, XORIS8)) + +// xoris - xori +FUSION_FEATURE(XorisXori, hasWideImmFusion, 1, FUSION_OP_SET(XORIS, XORIS8), + FUSION_OP_SET(XORI, XORI8)) + +// addis rx,ra,si - addi rt,rx,SI, SI >= 0 +FUSION_FEATURE(AddisAddi, hasWideImmFusion, 1, + FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8), + FUSION_OP_SET(ADDI, ADDI8, ADDItocL)) + +// addi rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2 +FUSION_FEATURE(AddiAddis, hasWideImmFusion, 1, + FUSION_OP_SET(ADDI, ADDI8, ADDItocL), + FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8)) + +// mtctr - { bcctr,bcctrl } +FUSION_FEATURE(ZeroMoveCTR, hasZeroMoveFusion, -1, + FUSION_OP_SET(MTCTR, MTCTRloop, MTSPR8, MTSPR), + FUSION_OP_SET(BCCTR, BCCTRn, BCCTR8, BCCTR8n, BCCTRL, BCCTRLn, + BCCTRL8, BCCTRL8n, 
gBCCTR, gBCCTRL)) + +// mtlr - { bclr,bclrl } +FUSION_FEATURE(ZeroMoveLR, hasZeroMoveFusion, -1, + FUSION_OP_SET(MTLR8, MTLR, MTSPR8, MTSPR), + FUSION_OP_SET(BCLR, BCLRn, gBCLR, BCLRL, BCLRLn, gBCLRL)) + #undef FUSION_FEATURE #undef FUSION_OP_SET diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index dfc29db..1258a12 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -131,6 +131,10 @@ void PPCSubtarget::initializeEnvironment() { HasAddLogicalFusion = false; HasLogicalAddFusion = false; HasLogicalFusion = false; + HasSha3Fusion = false; + HasCompareFusion = false; + HasWideImmFusion = false; + HasZeroMoveFusion = false; IsISA2_06 = false; IsISA2_07 = false; IsISA3_0 = false; diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index 783ea12..d52833c 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -151,6 +151,10 @@ protected: bool HasAddLogicalFusion; bool HasLogicalAddFusion; bool HasLogicalFusion; + bool HasSha3Fusion; + bool HasCompareFusion; + bool HasWideImmFusion; + bool HasZeroMoveFusion; bool IsISA2_06; bool IsISA2_07; bool IsISA3_0; @@ -340,6 +344,10 @@ public: bool hasAddLogicalFusion() const { return HasAddLogicalFusion; } bool hasLogicalAddFusion() const { return HasLogicalAddFusion; } bool hasLogicalFusion() const { return HasLogicalFusion; } + bool hasCompareFusion() const { return HasCompareFusion; } + bool hasWideImmFusion() const { return HasWideImmFusion; } + bool hasSha3Fusion() const { return HasSha3Fusion; } + bool hasZeroMoveFusion() const { return HasZeroMoveFusion; } bool needsSwapsForVSXMemOps() const { return hasVSX() && isLittleEndian() && !hasP9Vector(); } diff --git a/llvm/test/CodeGen/PowerPC/macro-fusion.mir b/llvm/test/CodeGen/PowerPC/macro-fusion.mir index 16391a2..91c435d 100644 --- a/llvm/test/CodeGen/PowerPC/macro-fusion.mir +++ 
b/llvm/test/CodeGen/PowerPC/macro-fusion.mir @@ -1,6 +1,7 @@ # REQUIRES: asserts # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 -x=mir < %s \ # RUN: -debug-only=machine-scheduler -start-before=postmisched 2>&1 \ +# RUN: -mattr=+fuse-zeromove,+fuse-cmp,+fuse-wideimm \ # RUN: | FileCheck %s # CHECK: add_mulld:%bb.0 @@ -93,3 +94,55 @@ body: | renamable $x3 = ADD8 killed renamable $x4, $x5 BLR8 implicit $lr8, implicit $rm, implicit $x3 ... + +# CHECK: rldicl_xor:%bb.0 +# CHECK: Macro fuse: SU(0) - SU(1) / RLDICL - XOR8 +--- +name: rldicl_xor +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3, $x4, $x5 + renamable $x4 = RLDICL $x3, 1, 0 + renamable $x3 = XOR8 killed renamable $x4, $x5 + BLR8 implicit $lr8, implicit $rm, implicit $x3 +... + +# CHECK: rldicr_xor:%bb.0 +# CHECK: Macro fuse: SU(0) - SU(1) / RLDICR - XOR8 +--- +name: rldicr_xor +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3, $x4, $x5 + renamable $x4 = RLDICR $x3, 1, 63 + renamable $x3 = XOR8 killed renamable $x4, $x5 + BLR8 implicit $lr8, implicit $rm, implicit $x3 +... + +# CHECK: ori_oris:%bb.0 +# CHECK: Macro fuse: SU(0) - SU(1) / ORI8 - ORIS8 +--- +name: ori_oris +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3, $x4 + renamable $x4 = ORI8 $x3, 63 + renamable $x3 = ORIS8 killed renamable $x4, 20 + BLR8 implicit $lr8, implicit $rm, implicit $x3 +... + +# CHECK: load_cmp:%bb.0 +# CHECK: Macro fuse: SU(0) - SU(1) / LD - CMPDI +--- +name: load_cmp +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3, $x4, $x5 + renamable $x3 = LD 0, killed renamable $x3 + renamable $cr0 = CMPDI killed renamable $x3, 0 + renamable $x3 = ISEL8 killed renamable $x5, killed renamable $x4, renamable $cr0lt, implicit killed $cr0 + BLR8 implicit $lr8, implicit $rm, implicit $x3 -- 2.7.4