AArch64: Macrofusion: Split features, add missing combinations.
author Matthias Braun <matze@braunis.de>
Tue, 4 Oct 2016 19:28:21 +0000 (19:28 +0000)
committer Matthias Braun <matze@braunis.de>
Tue, 4 Oct 2016 19:28:21 +0000 (19:28 +0000)
AArch64InstrInfo::shouldScheduleAdjacent() determines whether two
instructions can benefit from macro-op fusion on Apple CPUs. The list
turned out to be incomplete:
- the "rr" variants of the instructions were missing
- even the "rs" variants can have shift value == 0 and behave like the
  "rr" variants

This also splits the MacroOpFusion target feature into
ArithmeticBccFusion and ArithmeticCbzFusion.

Differential Revision: https://reviews.llvm.org/D25142

llvm-svn: 283243

llvm/lib/Target/AArch64/AArch64.td
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
llvm/lib/Target/AArch64/AArch64Subtarget.h
llvm/test/CodeGen/AArch64/misched-fusion.ll

index 5c66748..2ff3cf4 100644 (file)
@@ -94,9 +94,13 @@ def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
     "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
     "true", "Use alternative pattern for sextload convert to f32">;
 
-def FeatureMacroOpFusion : SubtargetFeature<
-    "macroop-fusion", "HasMacroOpFusion", "true",
-    "CPU supports macro op fusion">;
+def FeatureArithmeticBccFusion : SubtargetFeature<
+    "arith-bcc-fusion", "HasArithmeticBccFusion", "true",
+    "CPU fuses arithmetic+bcc operations">;
+
+def FeatureArithmeticCbzFusion : SubtargetFeature<
+    "arith-cbz-fusion", "HasArithmeticCbzFusion", "true",
+    "CPU fuses arithmetic + cbz/cbnz operations">;
 
 def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
     "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
@@ -204,7 +208,8 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
                                    FeatureCrypto,
                                    FeatureDisableLatencySchedHeuristic,
                                    FeatureFPARMv8,
-                                   FeatureMacroOpFusion,
+                                   FeatureArithmeticBccFusion,
+                                   FeatureArithmeticCbzFusion,
                                    FeatureNEON,
                                    FeaturePerfMon,
                                    FeatureSlowMisaligned128Store,
@@ -244,7 +249,7 @@ def ProcVulcan  : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
                                    FeatureCRC,
                                    FeatureCrypto,
                                    FeatureFPARMv8,
-                                   FeatureMacroOpFusion,
+                                   FeatureArithmeticBccFusion,
                                    FeatureNEON,
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
index 859f782..b26dbce 100644 (file)
@@ -1876,39 +1876,80 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
 
 bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr &First,
                                               MachineInstr &Second) const {
-  if (Subtarget.hasMacroOpFusion()) {
+  if (Subtarget.hasArithmeticBccFusion()) {
     // Fuse CMN, CMP, TST followed by Bcc.
     unsigned SecondOpcode = Second.getOpcode();
     if (SecondOpcode == AArch64::Bcc) {
       switch (First.getOpcode()) {
       default:
         return false;
-      case AArch64::SUBSWri:
       case AArch64::ADDSWri:
-      case AArch64::ANDSWri:
-      case AArch64::SUBSXri:
+      case AArch64::ADDSWrr:
       case AArch64::ADDSXri:
+      case AArch64::ADDSXrr:
+      case AArch64::ANDSWri:
+      case AArch64::ANDSWrr:
       case AArch64::ANDSXri:
+      case AArch64::ANDSXrr:
+      case AArch64::SUBSWri:
+      case AArch64::SUBSWrr:
+      case AArch64::SUBSXri:
+      case AArch64::SUBSXrr:
+      case AArch64::BICSWrr:
+      case AArch64::BICSXrr:
         return true;
+      case AArch64::ADDSWrs:
+      case AArch64::ADDSXrs:
+      case AArch64::ANDSWrs:
+      case AArch64::ANDSXrs:
+      case AArch64::SUBSWrs:
+      case AArch64::SUBSXrs:
+      case AArch64::BICSWrs:
+      case AArch64::BICSXrs:
+        // Shift value can be 0 making these behave like the "rr" variant...
+        return !hasShiftedReg(Second);
       }
     }
+  }
+  if (Subtarget.hasArithmeticCbzFusion()) {
     // Fuse ALU operations followed by CBZ/CBNZ.
+    unsigned SecondOpcode = Second.getOpcode();
     if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
         SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
       switch (First.getOpcode()) {
       default:
         return false;
       case AArch64::ADDWri:
+      case AArch64::ADDWrr:
       case AArch64::ADDXri:
+      case AArch64::ADDXrr:
       case AArch64::ANDWri:
+      case AArch64::ANDWrr:
       case AArch64::ANDXri:
+      case AArch64::ANDXrr:
       case AArch64::EORWri:
+      case AArch64::EORWrr:
       case AArch64::EORXri:
+      case AArch64::EORXrr:
       case AArch64::ORRWri:
+      case AArch64::ORRWrr:
       case AArch64::ORRXri:
+      case AArch64::ORRXrr:
       case AArch64::SUBWri:
+      case AArch64::SUBWrr:
       case AArch64::SUBXri:
+      case AArch64::SUBXrr:
         return true;
+      case AArch64::ADDWrs:
+      case AArch64::ADDXrs:
+      case AArch64::ANDWrs:
+      case AArch64::ANDXrs:
+      case AArch64::SUBWrs:
+      case AArch64::SUBXrs:
+      case AArch64::BICWrs:
+      case AArch64::BICXrs:
+        // Shift value can be 0 making these behave like the "rr" variant...
+        return !hasShiftedReg(Second);
       }
     }
   }
index 9f51c6b..a21dbd8 100644 (file)
@@ -80,7 +80,8 @@ protected:
   bool Misaligned128StoreIsSlow = false;
   bool AvoidQuadLdStPairs = false;
   bool UseAlternateSExtLoadCVTF32Pattern = false;
-  bool HasMacroOpFusion = false;
+  bool HasArithmeticBccFusion = false;
+  bool HasArithmeticCbzFusion = false;
   bool DisableLatencySchedHeuristic = false;
   uint8_t MaxInterleaveFactor = 2;
   uint8_t VectorInsertExtractBaseCost = 3;
@@ -188,7 +189,8 @@ public:
   bool useAlternateSExtLoadCVTF32Pattern() const {
     return UseAlternateSExtLoadCVTF32Pattern;
   }
-  bool hasMacroOpFusion() const { return HasMacroOpFusion; }
+  bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; }
+  bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
   unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
   unsigned getVectorInsertExtractBaseCost() const {
     return VectorInsertExtractBaseCost;
index 0f4c0ac..d5140c6 100644 (file)
@@ -1,4 +1,4 @@
-; RUN: llc -o - %s -mattr=+macroop-fusion,+use-postra-scheduler | FileCheck %s
+; RUN: llc -o - %s -mattr=+arith-cbz-fusion,+use-postra-scheduler | FileCheck %s
 ; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s
 
 target triple = "arm64-apple-ios"