From 53f0d41dc4879cf79bea8acc4aa87064a085d05c Mon Sep 17 00:00:00 2001 From: Evandro Menezes Date: Mon, 10 Dec 2018 17:17:26 +0000 Subject: [PATCH] [AArch64] Refactor the Exynos scheduling predicates Refactor the scheduling predicates based on `MCInstPredicate`. In this case, for the Exynos processors. Differential revision: https://reviews.llvm.org/D55345 llvm-svn: 348774 --- llvm/include/llvm/Target/TargetInstrPredicate.td | 2 +- llvm/lib/Target/AArch64/AArch64.td | 1 + llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 209 +-------------------- llvm/lib/Target/AArch64/AArch64InstrInfo.h | 9 - llvm/lib/Target/AArch64/AArch64SchedExynosM1.td | 122 ++++++------ llvm/lib/Target/AArch64/AArch64SchedExynosM3.td | 113 +++++------ llvm/lib/Target/AArch64/AArch64SchedPredExynos.td | 124 ++++++++++++ .../llvm-mca/AArch64/Exynos/extended-register.s | 44 ++--- .../llvm-mca/AArch64/Exynos/register-offset.s | 29 ++- .../llvm-mca/AArch64/Exynos/shifted-register.s | 24 +-- 10 files changed, 297 insertions(+), 380 deletions(-) create mode 100644 llvm/lib/Target/AArch64/AArch64SchedPredExynos.td diff --git a/llvm/include/llvm/Target/TargetInstrPredicate.td b/llvm/include/llvm/Target/TargetInstrPredicate.td index e70da00..4b2c57b 100644 --- a/llvm/include/llvm/Target/TargetInstrPredicate.td +++ b/llvm/include/llvm/Target/TargetInstrPredicate.td @@ -39,7 +39,7 @@ // processor scheduling model. // // The `MCInstPredicateExample` definition above is equivalent (and therefore -// could replace) the following definition from the ExynosM3 model (see +// could replace) the following definition from a previous ExynosM3 model (see // AArch64SchedExynosM3.td): // // def M3BranchLinkFastPred : SchedPredicate<[{ diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index f914735..a13f6c8 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -371,6 +371,7 @@ include "AArch64CallingConvention.td" include "AArch64Schedule.td" include "AArch64InstrInfo.td" include "AArch64SchedPredicates.td" +include "AArch64SchedPredExynos.td" def AArch64InstrInfo : InstrInfo; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index e3e4625..eddb349 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -705,7 +705,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { // Secondly, check cases specific to sub-targets. if (Subtarget.hasExynosCheapAsMoveHandling()) { - if (isExynosResetFast(MI) || isExynosShiftExtFast(MI)) + if (isExynosCheapAsMove(MI)) return true; return MI.isAsCheapAsAMove(); @@ -759,213 +759,6 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { llvm_unreachable("Unknown opcode to check as cheap as a move!"); } -bool AArch64InstrInfo::isExynosResetFast(const MachineInstr &MI) { - unsigned Reg, Imm, Shift; - - switch (MI.getOpcode()) { - default: - return false; - - // MOV Rd, SP - case AArch64::ADDWri: - case AArch64::ADDXri: - if (!MI.getOperand(1).isReg() || !MI.getOperand(2).isImm()) - return false; - - Reg = MI.getOperand(1).getReg(); - Imm = MI.getOperand(2).getImm(); - return ((Reg == AArch64::WSP || Reg == AArch64::SP) && Imm == 0); - - // Literal - case AArch64::ADR: - case AArch64::ADRP: - return true; - - // MOVI Vd, #0 - case AArch64::MOVID: - case AArch64::MOVIv8b_ns: - case AArch64::MOVIv2d_ns: - case AArch64::MOVIv16b_ns: - Imm = MI.getOperand(1).getImm(); - return (Imm == 0); - - // MOVI Vd, #0 - case AArch64::MOVIv2i32: - case AArch64::MOVIv4i16: - case AArch64::MOVIv4i32: - case AArch64::MOVIv8i16: - Imm = MI.getOperand(1).getImm(); - Shift = MI.getOperand(2).getImm(); - return (Imm == 0 && Shift == 0); - - // MOV Rd, Imm - case AArch64::MOVNWi: - case AArch64::MOVNXi: - - // MOV Rd, Imm - case AArch64::MOVZWi: - case AArch64::MOVZXi: - return true; - - // MOV Rd, Imm - case AArch64::ORRWri: - case AArch64::ORRXri: - if (!MI.getOperand(1).isReg()) - return false; - - Reg = MI.getOperand(1).getReg(); - Imm = MI.getOperand(2).getImm(); - return ((Reg == AArch64::WZR || Reg == AArch64::XZR) && Imm == 0); - - // MOV Rd, Rm - case AArch64::ORRWrs: - case AArch64::ORRXrs: - if (!MI.getOperand(1).isReg()) - return false; - - Reg = MI.getOperand(1).getReg(); - Imm = MI.getOperand(3).getImm(); - Shift = AArch64_AM::getShiftValue(Imm); - return ((Reg == AArch64::WZR || Reg == AArch64::XZR) && Shift == 0); - } -} - -bool AArch64InstrInfo::isExynosLdStExtFast(const MachineInstr &MI) { - unsigned Imm; - AArch64_AM::ShiftExtendType Ext; - - switch (MI.getOpcode()) { - default: - return false; - - // WriteLD - case AArch64::PRFMroW: - case AArch64::PRFMroX: - - // WriteLDIdx - case AArch64::LDRBBroW: - case AArch64::LDRBBroX: - case AArch64::LDRHHroW: - case AArch64::LDRHHroX: - case AArch64::LDRSBWroW: - case AArch64::LDRSBWroX: - case AArch64::LDRSBXroW: - case AArch64::LDRSBXroX: - case AArch64::LDRSHWroW: - case AArch64::LDRSHWroX: - case AArch64::LDRSHXroW: - case AArch64::LDRSHXroX: - case AArch64::LDRSWroW: - case AArch64::LDRSWroX: - case AArch64::LDRWroW: - case AArch64::LDRWroX: - case AArch64::LDRXroW: - case AArch64::LDRXroX: - - case AArch64::LDRBroW: - case AArch64::LDRBroX: - case AArch64::LDRDroW: - case AArch64::LDRDroX: - case AArch64::LDRHroW: - case AArch64::LDRHroX: - case AArch64::LDRSroW: - case AArch64::LDRSroX: - - // WriteSTIdx - case AArch64::STRBBroW: - case AArch64::STRBBroX: - case AArch64::STRHHroW: - case AArch64::STRHHroX: - case AArch64::STRWroW: - case AArch64::STRWroX: - case AArch64::STRXroW: - case AArch64::STRXroX: - - case AArch64::STRBroW: - case AArch64::STRBroX: - case AArch64::STRDroW: - case AArch64::STRDroX: - case AArch64::STRHroW: - case AArch64::STRHroX: - case AArch64::STRSroW: - case AArch64::STRSroX: - Imm = MI.getOperand(3).getImm(); - Ext = AArch64_AM::getMemExtendType(Imm); - return (Ext == AArch64_AM::SXTX || Ext == AArch64_AM::UXTX); - } -} - -bool AArch64InstrInfo::isExynosShiftExtFast(const MachineInstr &MI) { - unsigned Imm, Shift; - AArch64_AM::ShiftExtendType Ext = AArch64_AM::UXTX; - - switch (MI.getOpcode()) { - default: - return false; - - // WriteI - case AArch64::ADDSWri: - case AArch64::ADDSXri: - case AArch64::ADDWri: - case AArch64::ADDXri: - case AArch64::SUBSWri: - case AArch64::SUBSXri: - case AArch64::SUBWri: - case AArch64::SUBXri: - return true; - - // WriteISReg - case AArch64::ADDSWrs: - case AArch64::ADDSXrs: - case AArch64::ADDWrs: - case AArch64::ADDXrs: - case AArch64::ANDSWrs: - case AArch64::ANDSXrs: - case AArch64::ANDWrs: - case AArch64::ANDXrs: - case AArch64::BICSWrs: - case AArch64::BICSXrs: - case AArch64::BICWrs: - case AArch64::BICXrs: - case AArch64::EONWrs: - case AArch64::EONXrs: - case AArch64::EORWrs: - case AArch64::EORXrs: - case AArch64::ORNWrs: - case AArch64::ORNXrs: - case AArch64::ORRWrs: - case AArch64::ORRXrs: - case AArch64::SUBSWrs: - case AArch64::SUBSXrs: - case AArch64::SUBWrs: - case AArch64::SUBXrs: - Imm = MI.getOperand(3).getImm(); - Shift = AArch64_AM::getShiftValue(Imm); - Ext = AArch64_AM::getShiftType(Imm); - return (Shift == 0 || (Shift <= 3 && Ext == AArch64_AM::LSL)); - - // WriteIEReg - case AArch64::ADDSWrx: - case AArch64::ADDSXrx: - case AArch64::ADDWrx: - case AArch64::ADDXrx: - case AArch64::SUBSWrx: - case AArch64::SUBSXrx: - case AArch64::SUBWrx: - case AArch64::SUBXrx: - Ext = AArch64_AM::UXTW; - LLVM_FALLTHROUGH; - case AArch64::ADDSXrx64: - case AArch64::ADDXrx64: - case AArch64::SUBSXrx64: - case AArch64::SUBXrx64: - Imm = MI.getOperand(3).getImm(); - Shift = AArch64_AM::getArithShiftValue(Imm); - return (Shift == 0 || - (Shift <= 3 && Ext == AArch64_AM::getArithExtendType(Imm))); - } -} - bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) { switch (MI.getOpcode()) { default: diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index f5be3ae..9954669 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -241,15 +241,6 @@ public: MachineBasicBlock::iterator &It, MachineFunction &MF, const outliner::Candidate &C) const override; bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; - /// Returns true if the instruction sets a constant value that can be - /// executed more efficiently. - static bool isExynosResetFast(const MachineInstr &MI); - /// Returns true if the load or store has an extension that can be executed - /// more efficiently. - static bool isExynosLdStExtFast(const MachineInstr &MI); - /// Returns true if the instruction has a constant shift left or extension - /// that can be executed more efficiently. - static bool isExynosShiftExtFast(const MachineInstr &MI); /// Returns true if the instruction has a shift by immediate that can be /// executed in one cycle less. static bool isFalkorShiftExtFast(const MachineInstr &MI); diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM1.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM1.td index d566a13..62a4650 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM1.td @@ -62,14 +62,6 @@ def M1UnitNALU : ProcResGroup<[M1UnitNAL0, M1UnitNAL1]>; // All simple vector //===----------------------------------------------------------------------===// -// Predicates. - -def M1BranchLinkPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR && - MI->getOperand(0).getReg() != AArch64::LR}]>; -def M1LdStExtPred : SchedPredicate<[{TII->isExynosLdStExtFast(*MI)}]>; -def M1ShiftExtPred : SchedPredicate<[{TII->isExynosShiftExtFast(*MI)}]>; - -//===----------------------------------------------------------------------===// // Coarse scheduling model. def M1WriteA1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; } @@ -86,14 +78,16 @@ def M1WriteAC : SchedWriteRes<[M1UnitALU, def M1WriteAD : SchedWriteRes<[M1UnitALU, M1UnitC]> { let Latency = 2; let NumMicroOps = 2; } -def M1WriteAX : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M1WriteAX : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M1WriteAY : SchedWriteVariant<[SchedVar, + SchedVar]>; def M1WriteC1 : SchedWriteRes<[M1UnitC]> { let Latency = 1; } def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; } def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; } -def M1WriteBX : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M1WriteBX : SchedWriteVariant<[SchedVar, + SchedVar]>; def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; } def M1WriteL6 : SchedWriteRes<[M1UnitL]> { let Latency = 6; } @@ -111,40 +105,27 @@ def M1WriteLD : SchedWriteRes<[M1UnitL, let ResourceCycles = [2, 1]; } def M1WriteLH : SchedWriteRes<[]> { let Latency = 5; let NumMicroOps = 0; } -def M1WriteLX : SchedWriteVariant<[SchedVar, - SchedVar]>; -def M1WriteLY : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M1WriteLX : SchedWriteVariant<[SchedVar, + SchedVar]>; def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; } def M1WriteS3 : SchedWriteRes<[M1UnitS]> { let Latency = 3; } def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; } def M1WriteSA : SchedWriteRes<[M1UnitS, M1UnitFST, - M1UnitS, - M1UnitFST]> { let Latency = 1; - let NumMicroOps = 2; } -def M1WriteSB : SchedWriteRes<[M1UnitS, - M1UnitFST, M1UnitA]> { let Latency = 3; let NumMicroOps = 2; } -def M1WriteSC : SchedWriteRes<[M1UnitS, +def M1WriteSB : SchedWriteRes<[M1UnitS, M1UnitFST, M1UnitS, M1UnitFST, M1UnitA]> { let Latency = 3; let NumMicroOps = 3; } -def M1WriteSD : SchedWriteRes<[M1UnitS, - M1UnitFST, - M1UnitA]> { let Latency = 1; - let NumMicroOps = 2; } -def M1WriteSE : SchedWriteRes<[M1UnitS, +def M1WriteSC : SchedWriteRes<[M1UnitS, M1UnitA]> { let Latency = 2; let NumMicroOps = 2; } -def M1WriteSX : SchedWriteVariant<[SchedVar, - SchedVar]>; -def M1WriteSY : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M1WriteSX : SchedWriteVariant<[SchedVar, + SchedVar]>; def M1ReadAdrBase : SchedReadVariant<[SchedVar, SchedVar]>; @@ -415,9 +396,9 @@ def M1WriteVSTH : SchedWriteRes<[M1UnitNALU, M1UnitS, M1UnitFST, M1UnitFST, - M1UnitFST]> { let Latency = 14; - let NumMicroOps = 4; - let ResourceCycles = [1, 7, 1, 7, 1]; } + M1UnitFST]> { let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [1, 7, 1, 7, 1]; } def M1WriteVSTI : SchedWriteRes<[M1UnitNALU, M1UnitS, M1UnitFST, @@ -428,9 +409,17 @@ def M1WriteVSTI : SchedWriteRes<[M1UnitNALU, M1UnitS, M1UnitFST, M1UnitFST, - M1UnitFST]> { let Latency = 17; - let NumMicroOps = 7; - let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 1]; } + M1UnitFST]> { let Latency = 17; + let NumMicroOps = 7; + let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 1]; } + +// Special cases. +def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } +def M1WriteCOPY : SchedWriteVariant<[SchedVar, + SchedVar]>; + +// Fast forwarding. +def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>; // Branch instructions def : InstRW<[M1WriteB1], (instrs Bcc)>; @@ -440,21 +429,34 @@ def : InstRW<[M1WriteC1], (instregex "^CBN?Z[WX]")>; def : InstRW<[M1WriteAD], (instregex "^TBN?Z[WX]")>; // Arithmetic and logical integer instructions. -def : InstRW<[M1WriteA1], (instrs COPY)>; -def : InstRW<[M1WriteAX], (instregex ".+r[sx](64)?$")>; +def : InstRW<[M1WriteAX], (instregex ".+rx(64)?$")>; +def : InstRW<[M1WriteAY], (instregex ".+rs$")>; + +// Move instructions. +def : InstRW<[M1WriteCOPY], (instrs COPY)>; // Divide and multiply instructions. // Miscellaneous instructions. // Load instructions. +def : InstRW<[M1WriteLC, + ReadAdrBase], (instregex "^LDR(BB|HH|SBW|SBX|SHW|SWX|SW|W|X)roW")>; +def : InstRW<[M1WriteL5, + ReadAdrBase], (instregex "^LDR(BB|HH|SBW|SBX|SHW|SWX|SW|W|X)roX")>; def : InstRW<[M1WriteLB, WriteLDHi, WriteAdr], (instregex "^LDP(SW|W|X)(post|pre)")>; -def : InstRW<[M1WriteLX, - ReadAdrBase], (instregex "^PRFMro[WX]")>; +def : InstRW<[M1WriteLC, + ReadAdrBase], (instrs PRFMroW)>; +def : InstRW<[M1WriteL5, + ReadAdrBase], (instrs PRFMroX)>; // Store instructions. +def : InstRW<[M1WriteSC, + ReadAdrBase], (instregex "^STR(BB|HH|W|X)roW")>; +def : InstRW<[WriteST, + ReadAdrBase], (instregex "^STR(BB|HH|W|X)roX")>; // FP data instructions. def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)[DS]r")>; @@ -488,8 +490,10 @@ def : InstRW<[WriteVLD], (instregex "^LDUR[BDHSQ]i")>; def : InstRW<[WriteVLD, WriteAdr], (instregex "^LDR[BDHSQ](post|pre)")>; def : InstRW<[WriteVLD], (instregex "^LDR[BDHSQ]ui")>; -def : InstRW<[M1WriteLY, - ReadAdrBase], (instregex "^LDR[BDHS]ro[WX]")>; +def : InstRW<[M1WriteLD, + ReadAdrBase], (instregex "^LDR[BDHS]roW")>; +def : InstRW<[WriteVLD, + ReadAdrBase], (instregex "^LDR[BDHS]roX")>; def : InstRW<[M1WriteLD, ReadAdrBase], (instregex "^LDRQro[WX]")>; def : InstRW<[WriteVLD, @@ -508,14 +512,16 @@ def : InstRW<[WriteVST], (instregex "^STUR[BDHSQ]i")>; def : InstRW<[WriteVST, WriteAdr], (instregex "^STR[BDHSQ](post|pre)")>; def : InstRW<[WriteVST], (instregex "^STR[BDHSQ]ui")>; -def : InstRW<[M1WriteSY, - ReadAdrBase], (instregex "^STR[BDHS]ro[WX]")>; -def : InstRW<[M1WriteSB, +def : InstRW<[M1WriteSA, + ReadAdrBase], (instregex "^STR[BDHS]roW")>; +def : InstRW<[WriteVST, + ReadAdrBase], (instregex "^STR[BDHS]roX")>; +def : InstRW<[M1WriteSA, ReadAdrBase], (instregex "^STRQro[WX]")>; def : InstRW<[WriteVST], (instregex "^STN?P[DSQ]i")>; def : InstRW<[WriteVST, WriteAdr], (instregex "^STP[DS](post|pre)")>; -def : InstRW<[M1WriteSC, +def : InstRW<[M1WriteSB, WriteAdr], (instregex "^STPQ(post|pre)")>; // ASIMD instructions. @@ -609,21 +615,21 @@ def : InstRW<[M1WriteVLDE], (instregex "LD1i(64)$")>; def : InstRW<[M1WriteVLDE, WriteAdr], (instregex "LD1i(64)_POST$")>; -def : InstRW<[M1WriteL5], (instregex "LD1Rv(8b|4h|2s)$")>; -def : InstRW<[M1WriteL5, +def : InstRW<[WriteVLD], (instregex "LD1Rv(8b|4h|2s)$")>; +def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>; -def : InstRW<[M1WriteL5], (instregex "LD1Rv(1d)$")>; -def : InstRW<[M1WriteL5, +def : InstRW<[WriteVLD], (instregex "LD1Rv(1d)$")>; +def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Rv(1d)_POST$")>; -def : InstRW<[M1WriteL5], (instregex "LD1Rv(16b|8h|4s|2d)$")>; -def : InstRW<[M1WriteL5, +def : InstRW<[WriteVLD], (instregex "LD1Rv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[M1WriteL5], (instregex "LD1Onev(8b|4h|2s|1d)$")>; -def : InstRW<[M1WriteL5, +def : InstRW<[WriteVLD], (instregex "LD1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[M1WriteL5], (instregex "LD1Onev(16b|8h|4s|2d)$")>; -def : InstRW<[M1WriteL5, +def : InstRW<[WriteVLD], (instregex "LD1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(8b|4h|2s|1d)$")>; def : InstRW<[M1WriteVLDA, @@ -831,8 +837,6 @@ def : InstRW<[M1WriteVSTI, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; // Cryptography instructions. -def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } -def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>; def : InstRW<[M1WriteAES], (instregex "^AES[DE]")>; def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AESI?MC")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td index e61fb61..39f448e 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td @@ -104,20 +104,6 @@ def M3UnitNSHF : ProcResGroup<[M3UnitNSHF0, M3UnitNSHF2]>; //===----------------------------------------------------------------------===// -// Predicates. - -def M3BranchLinkPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR && - MI->getOperand(0).isReg() && - MI->getOperand(0).getReg() != AArch64::LR}]>; -def M3ResetPred : SchedPredicate<[{TII->isExynosResetFast(*MI)}]>; -def M3RotatePred : SchedPredicate<[{(MI->getOpcode() == AArch64::EXTRWrri || - MI->getOpcode() == AArch64::EXTRXrri) && - MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && - MI->getOperand(1).getReg() == MI->getOperand(2).getReg()}]>; -def M3LdStExtPred : SchedPredicate<[{TII->isExynosLdStExtFast(*MI)}]>; -def M3ShiftExtPred : SchedPredicate<[{TII->isExynosShiftExtFast(*MI)}]>; - -//===----------------------------------------------------------------------===// // Coarse scheduling model. def M3WriteZ0 : SchedWriteRes<[]> { let Latency = 0; @@ -138,15 +124,23 @@ def M3WriteAD : SchedWriteRes<[M3UnitALU, let NumMicroOps = 2; } def M3WriteC1 : SchedWriteRes<[M3UnitC]> { let Latency = 1; } def M3WriteC2 : SchedWriteRes<[M3UnitC]> { let Latency = 2; } -def M3WriteAX : SchedWriteVariant<[SchedVar, - SchedVar, - SchedVar]>; -def M3WriteAY : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M3WriteAU : SchedWriteVariant<[SchedVar, + SchedVar, + SchedVar]>; +def M3WriteAV : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M3WriteAW : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M3WriteAX : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M3WriteAY : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M3WriteAZ : SchedWriteVariant<[SchedVar, + SchedVar]>; def M3WriteB1 : SchedWriteRes<[M3UnitB]> { let Latency = 1; } -def M3WriteBX : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M3WriteBX : SchedWriteVariant<[SchedVar, + SchedVar]>; def M3WriteL4 : SchedWriteRes<[M3UnitL]> { let Latency = 4; } def M3WriteL5 : SchedWriteRes<[M3UnitL]> { let Latency = 5; } @@ -163,28 +157,24 @@ def M3WriteLC : SchedWriteRes<[M3UnitA, def M3WriteLD : SchedWriteRes<[M3UnitA, M3UnitL]> { let Latency = 4; let NumMicroOps = 2; } +def M3WriteLE : SchedWriteRes<[M3UnitA, + M3UnitL]> { let Latency = 6; + let NumMicroOps = 2; } def M3WriteLH : SchedWriteRes<[]> { let Latency = 5; let NumMicroOps = 0; } - -def M3WriteLX : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M3WriteLX : SchedWriteVariant<[SchedVar, + SchedVar]>; def M3WriteS1 : SchedWriteRes<[M3UnitS]> { let Latency = 1; } def M3WriteSA : SchedWriteRes<[M3UnitA, M3UnitS, - M3UnitFST]> { let Latency = 2; + M3UnitFST]> { let Latency = 3; let NumMicroOps = 2; } def M3WriteSB : SchedWriteRes<[M3UnitA, - M3UnitS]> { let Latency = 1; - let NumMicroOps = 2; } -def M3WriteSC : SchedWriteRes<[M3UnitA, M3UnitS]> { let Latency = 2; let NumMicroOps = 2; } - -def M3WriteSX : SchedWriteVariant<[SchedVar, - SchedVar]>; -def M3WriteSY : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M3WriteSX : SchedWriteVariant<[SchedVar, + SchedVar]>; def M3ReadAdrBase : SchedReadVariant<[SchedVar, SchedVar]>; @@ -214,9 +204,7 @@ def : WriteRes { let Latency = 4; let ResourceCycles = [2]; } // Miscellaneous instructions. -def : WriteRes { let Latency = 1; - let NumMicroOps = 2; } +def : SchedAlias; // Addressing modes. def : WriteRes { let Latency = 1; @@ -479,11 +467,15 @@ def M3WriteVSTI : SchedWriteRes<[M3UnitNALU, // Special cases. def M3WriteAES : SchedWriteRes<[M3UnitNCRY]> { let Latency = 1; } +def M3WriteCOPY : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M3WriteMOVI : SchedWriteVariant<[SchedVar, + SchedVar]>; + +// Fast forwarding. def M3ReadAES : SchedReadAdvance<1, [M3WriteAES]>; def M3ReadFMAC : SchedReadAdvance<1, [M3WriteFMAC4, M3WriteFMAC5]>; -def M3WriteMOVI : SchedWriteVariant<[SchedVar, - SchedVar]>; def M3ReadNMUL : SchedReadAdvance<1, [M3WriteNMUL3]>; // Branch instructions @@ -494,29 +486,40 @@ def : InstRW<[M3WriteC1], (instregex "^CBN?Z[WX]")>; def : InstRW<[M3WriteAD], (instregex "^TBN?Z[WX]")>; // Arithmetic and logical integer instructions. -def : InstRW<[M3WriteA1], (instrs COPY)>; -def : InstRW<[M3WriteAX], (instregex "^(ADD|SUB)S?Xrx64")>; -def : InstRW<[M3WriteAX], (instregex "^(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[sx]$")>; -def : InstRW<[M3WriteAX], (instregex "^(ADD|BIC|SUB)S[WX]r[sx]$")>; -def : InstRW<[M3WriteAX], (instregex "^(ADD|AND|EOR|ORR|SUB)[WX]ri")>; +def : InstRW<[M3WriteAZ], (instregex "^(ADD|AND|BIC|EON|EOR|ORN|SUB)[WX]rs$")>; +def : InstRW<[M3WriteAU], (instrs ORRWrs, ORRXrs)>; +def : InstRW<[M3WriteAX], (instregex "^(ADD|SUB)S?[WX]rx(64)?$")>; +def : InstRW<[M3WriteAZ], (instregex "^(ADD|AND|BIC|SUB)S[WX]rs$")>; +def : InstRW<[M3WriteAV], (instrs ADDWri, ADDXri)>; +def : InstRW<[M3WriteAW], (instrs ORRWri, ORRXri)>; // Move instructions. -def : InstRW<[M3WriteZ0], (instrs ADR, ADRP)>; -def : InstRW<[M3WriteZ0], (instregex "^MOV[NZ][WX]i")>; +def : InstRW<[M3WriteCOPY], (instrs COPY)>; +def : InstRW<[M3WriteZ0], (instrs ADR, ADRP)>; +def : InstRW<[M3WriteZ0], (instregex "^MOV[NZ][WX]i")>; // Divide and multiply instructions. // Miscellaneous instructions. -def : InstRW<[M3WriteAY], (instrs EXTRWrri, EXTRXrri)>; // Load instructions. +def : InstRW<[M3WriteLB, + ReadAdrBase], (instregex "^LDR(BB|HH|SBW|SBX|SHW|SWX|SW|W|X)roW")>; +def : InstRW<[M3WriteL5, + ReadAdrBase], (instregex "^LDR(BB|HH|SBW|SBX|SHW|SWX|SW|W|X)roX")>; def : InstRW<[M3WriteLD, WriteLDHi, WriteAdr], (instregex "^LDP(SW|W|X)(post|pre)")>; -def : InstRW<[M3WriteLX, - ReadAdrBase], (instregex "^PRFMro[WX]")>; +def : InstRW<[M3WriteLB, + ReadAdrBase], (instrs PRFMroW)>; +def : InstRW<[M3WriteL5, + ReadAdrBase], (instrs PRFMroX)>; // Store instructions. +def : InstRW<[M3WriteSB, + ReadAdrBase], (instregex "^STR(BB|HH|W|X)roW")>; +def : InstRW<[WriteST, + ReadAdrBase], (instregex "^STR(BB|HH|W|X)roX")>; // FP data instructions. def : InstRW<[M3WriteNSHF1], (instregex "^FABS[DS]r")>; @@ -553,9 +556,11 @@ def : InstRW<[WriteVLD], (instregex "^LDUR[BDHSQ]i")>; def : InstRW<[WriteVLD, WriteAdr], (instregex "^LDR[BDHSQ](post|pre)")>; def : InstRW<[WriteVLD], (instregex "^LDR[BDHSQ]ui")>; -def : InstRW<[M3WriteLX, - ReadAdrBase], (instregex "^LDR[BDHS]ro[WX]")>; -def : InstRW<[M3WriteLB, +def : InstRW<[M3WriteLE, + ReadAdrBase], (instregex "^LDR[BDHS]roW")>; +def : InstRW<[WriteVLD, + ReadAdrBase], (instregex "^LDR[BDHS]roX")>; +def : InstRW<[M3WriteLE, ReadAdrBase], (instregex "^LDRQro[WX]")>; def : InstRW<[WriteVLD, M3WriteLH], (instregex "^LDN?P[DS]i")>; @@ -573,8 +578,10 @@ def : InstRW<[WriteVST], (instregex "^STUR[BDHSQ]i")>; def : InstRW<[WriteVST, WriteAdr], (instregex "^STR[BDHSQ](post|pre)")>; def : InstRW<[WriteVST], (instregex "^STR[BDHSQ]ui")>; -def : InstRW<[M3WriteSY, - ReadAdrBase], (instregex "^STR[BDHS]ro[WX]")>; +def : InstRW<[M3WriteSA, + ReadAdrBase], (instregex "^STR[BDHS]roW")>; +def : InstRW<[WriteVST, + ReadAdrBase], (instregex "^STR[BDHS]roX")>; def : InstRW<[M3WriteSA, ReadAdrBase], (instregex "^STRQro[WX]")>; def : InstRW<[WriteVST], (instregex "^STN?P[DSQ]i")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td b/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td new file mode 100644 index 0000000..dcecbae --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td @@ -0,0 +1,124 @@ +//===- AArch64SchedPredExynos.td - AArch64 Sched Preds -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines scheduling predicate definitions that are used by the +// AArch64 Exynos processors. +// +//===----------------------------------------------------------------------===// + +// Identify BLR specifying the LR register as the indirect target register. +def ExynosBranchLinkLRPred : MCSchedPredicate< + CheckAll<[CheckOpcode<[BLR]>, + CheckRegOperand<0, LR>]>>; + +// Identify arithmetic and logic instructions without or with limited extension. +def ExynosExtFn : TIIPredicate< + "isExynosExtFast", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + IsArithExt32Op.ValidOpcodes, + MCReturnStatement< + CheckAny<[CheckExtBy0, + CheckAll< + [CheckExtUXTW, + CheckAny< + [CheckExtBy1, + CheckExtBy2, + CheckExtBy3]>]>]>>>, + MCOpcodeSwitchCase< + IsArithExt64Op.ValidOpcodes, + MCReturnStatement< + CheckAny<[CheckExtBy0, + CheckAll< + [CheckExtUXTX, + CheckAny< + [CheckExtBy1, + CheckExtBy2, + CheckExtBy3]>]>]>>>], + MCReturnStatement>>; +def ExynosExtPred : MCSchedPredicate; + +// Identify FP instructions. +def ExynosFPPred : MCSchedPredicate>; + +// Identify whether an instruction whose result is a long vector +// operates on the upper half of the input registers. +def ExynosLongVectorUpperFn : TIIPredicate< + "isExynosLongVectorUpper", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + IsLongVectorUpperOp.ValidOpcodes, + MCReturnStatement>], + MCReturnStatement>>; +def ExynosLongVectorUpperPred : MCSchedPredicate; + +// Identify 128-bit NEON instructions. +def ExynosQFormPred : MCSchedPredicate; + +// Identify instructions that reset a register efficiently. +def ExynosResetFn : TIIPredicate< + "isExynosResetFast", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + [ADR, ADRP, + MOVNWi, MOVNXi, + MOVZWi, MOVZXi], + MCReturnStatement>], + MCReturnStatement< + CheckAny< + [IsCopyIdiomFn, + IsZeroFPIdiomFn, + IsZeroIdiomFn]>>>>; +def ExynosResetPred : MCSchedPredicate; + +// Identify EXTR as the alias for ROR (immediate). +def ExynosRotateRightImmPred : MCSchedPredicate< + CheckAll<[CheckOpcode<[EXTRWrri, EXTRXrri]>, + CheckSameRegOperand<1, 2>]>>; + +// Identify arithmetic and logic instructions without or with limited shift. +def ExynosShiftFn : TIIPredicate< + "isExynosShiftFast", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + IsArithLogicShiftOp.ValidOpcodes, + MCReturnStatement< + CheckAny<[CheckShiftBy0, + CheckAll< + [CheckShiftLSL, + CheckAny< + [CheckShiftBy1, + CheckShiftBy2, + CheckShiftBy3]>]>]>>>], + MCReturnStatement>>; +def ExynosShiftPred : MCSchedPredicate; + +// Identify more arithmetic and logic instructions without or limited shift. +def ExynosShiftExFn : TIIPredicate< + "isExynosShiftExFast", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + IsArithLogicShiftOp.ValidOpcodes, + MCReturnStatement< + CheckAll< + [CheckShiftLSL, + CheckShiftBy8]>>>], + MCReturnStatement>>; +def ExynosShiftExPred : MCSchedPredicate; + +// Identify arithmetic and logic immediate instructions. +def ExynosCheapFn : TIIPredicate< + "isExynosCheapAsMove", + MCOpcodeSwitchStatement< + [MCOpcodeSwitchCase< + IsArithLogicImmOp.ValidOpcodes, + MCReturnStatement>], + MCReturnStatement< + CheckAny< + [ExynosExtFn, ExynosResetFn, ExynosShiftFn]>>>>; diff --git a/llvm/test/tools/llvm-mca/AArch64/Exynos/extended-register.s b/llvm/test/tools/llvm-mca/AArch64/Exynos/extended-register.s index 58fb4f0..c580a8d 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Exynos/extended-register.s +++ b/llvm/test/tools/llvm-mca/AArch64/Exynos/extended-register.s @@ -3,31 +3,31 @@ # RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,EM3 sub w0, w1, w2, sxtb #0 - add w3, w4, w5, sxth #1 + add x3, x4, w5, sxth #1 subs x6, x7, w8, uxtw #2 adds x9, x10, x11, uxtx #3 sub w12, w13, w14, uxtb #3 - add w15, w16, w17, uxth #2 + add x15, x16, w17, uxth #2 subs x18, x19, w20, sxtw #1 adds x21, x22, x23, sxtx #0 # ALL: Iterations: 100 # ALL-NEXT: Instructions: 800 -# EM1-NEXT: Total Cycles: 537 -# EM3-NEXT: Total Cycles: 403 +# EM1-NEXT: Total Cycles: 403 +# EM3-NEXT: Total Cycles: 303 # ALL-NEXT: Total uOps: 800 # EM1: Dispatch Width: 4 -# EM1-NEXT: uOps Per Cycle: 1.49 -# EM1-NEXT: IPC: 1.49 -# EM1-NEXT: Block RThroughput: 5.3 +# EM1-NEXT: uOps Per Cycle: 1.99 +# EM1-NEXT: IPC: 1.99 +# EM1-NEXT: Block RThroughput: 4.0 # EM3: Dispatch Width: 6 -# EM3-NEXT: uOps Per Cycle: 1.99 -# EM3-NEXT: IPC: 1.99 -# EM3-NEXT: Block RThroughput: 4.0 +# EM3-NEXT: uOps Per Cycle: 2.64 +# EM3-NEXT: IPC: 2.64 +# EM3-NEXT: Block RThroughput: 3.0 # ALL: Instruction Info: # ALL-NEXT: [1]: #uOps @@ -39,20 +39,20 @@ # ALL: [1] [2] [3] [4] [5] [6] Instructions: -# EM1-NEXT: 1 2 0.67 sub w0, w1, w2, sxtb -# EM1-NEXT: 1 2 0.67 add w3, w4, w5, sxth #1 -# EM1-NEXT: 1 2 0.67 subs x6, x7, w8, uxtw #2 -# EM1-NEXT: 1 2 0.67 adds x9, x10, x11, uxtx #3 +# EM1-NEXT: 1 1 0.33 sub w0, w1, w2, sxtb +# EM1-NEXT: 1 2 0.67 add x3, x4, w5, sxth #1 +# EM1-NEXT: 1 1 0.33 subs x6, x7, w8, uxtw #2 +# EM1-NEXT: 1 1 0.33 adds x9, x10, x11, uxtx #3 # EM1-NEXT: 1 2 0.67 sub w12, w13, w14, uxtb #3 -# EM1-NEXT: 1 2 0.67 add w15, w16, w17, uxth #2 +# EM1-NEXT: 1 2 0.67 add x15, x16, w17, uxth #2 # EM1-NEXT: 1 2 0.67 subs x18, x19, w20, sxtw #1 -# EM1-NEXT: 1 2 0.67 adds x21, x22, x23, sxtx +# EM1-NEXT: 1 1 0.33 adds x21, x22, x23, sxtx -# EM3-NEXT: 1 2 0.50 sub w0, w1, w2, sxtb -# EM3-NEXT: 1 2 0.50 add w3, w4, w5, sxth #1 -# EM3-NEXT: 1 2 0.50 subs x6, x7, w8, uxtw #2 -# EM3-NEXT: 1 2 0.50 adds x9, x10, x11, uxtx #3 +# EM3-NEXT: 1 1 0.25 sub w0, w1, w2, sxtb +# EM3-NEXT: 1 2 0.50 add x3, x4, w5, sxth #1 +# EM3-NEXT: 1 1 0.25 subs x6, x7, w8, uxtw #2 +# EM3-NEXT: 1 1 0.25 adds x9, x10, x11, uxtx #3 # EM3-NEXT: 1 2 0.50 sub w12, w13, w14, uxtb #3 -# EM3-NEXT: 1 2 0.50 add w15, w16, w17, uxth #2 +# EM3-NEXT: 1 2 0.50 add x15, x16, w17, uxth #2 # EM3-NEXT: 1 2 0.50 subs x18, x19, w20, sxtw #1 -# EM3-NEXT: 1 2 0.50 adds x21, x22, x23, sxtx +# EM3-NEXT: 1 1 0.25 adds x21, x22, x23, sxtx diff --git a/llvm/test/tools/llvm-mca/AArch64/Exynos/register-offset.s b/llvm/test/tools/llvm-mca/AArch64/Exynos/register-offset.s index fe1c75e..b31b396 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Exynos/register-offset.s +++ b/llvm/test/tools/llvm-mca/AArch64/Exynos/register-offset.s @@ -9,20 +9,17 @@ # ALL: Iterations: 100 # ALL-NEXT: Instructions: 400 - -# EM1-NEXT: Total Cycles: 408 -# EM3-NEXT: Total Cycles: 208 - -# ALL-NEXT: Total uOps: 800 +# ALL-NEXT: Total Cycles: 308 +# ALL-NEXT: Total uOps: 600 # EM1: Dispatch Width: 4 -# EM1-NEXT: uOps Per Cycle: 1.96 -# EM1-NEXT: IPC: 0.98 +# EM1-NEXT: uOps Per Cycle: 1.95 +# EM1-NEXT: IPC: 1.30 # EM1-NEXT: Block RThroughput: 2.0 # EM3: Dispatch Width: 6 -# EM3-NEXT: uOps Per Cycle: 3.85 -# EM3-NEXT: IPC: 1.92 +# EM3-NEXT: uOps Per Cycle: 1.95 +# EM3-NEXT: IPC: 1.30 # EM3-NEXT: Block RThroughput: 2.0 # ALL: Instruction Info: @@ -35,12 +32,12 @@ # ALL: [1] [2] [3] [4] [5] [6] Instructions: -# EM1-NEXT: 2 5 1.00 * ldr w0, [x1, x2] -# EM1-NEXT: 2 2 1.00 * str x3, [x4, w5, sxtw] -# EM1-NEXT: 2 5 1.00 * ldr x6, [x7, w8, uxtw #3] -# EM1-NEXT: 2 2 1.00 * str x9, [x10, x11, lsl #3] +# EM1-NEXT: 1 5 1.00 * ldr w0, [x1, x2] +# EM3-NEXT: 1 5 0.50 * ldr w0, [x1, x2] -# EM3-NEXT: 2 5 0.50 * ldr w0, [x1, x2] -# EM3-NEXT: 2 1 1.00 * str x3, [x4, w5, sxtw] +# ALL-NEXT: 2 2 1.00 * str x3, [x4, w5, sxtw] + +# EM1-NEXT: 2 5 1.00 * ldr x6, [x7, w8, uxtw #3] # EM3-NEXT: 2 5 0.50 * ldr x6, [x7, w8, uxtw #3] -# EM3-NEXT: 2 1 1.00 * str x9, [x10, x11, lsl #3] + +# ALL-NEXT: 1 1 1.00 * str x9, [x10, x11, lsl #3] diff --git a/llvm/test/tools/llvm-mca/AArch64/Exynos/shifted-register.s b/llvm/test/tools/llvm-mca/AArch64/Exynos/shifted-register.s index 4b73836..e37d2d0 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Exynos/shifted-register.s +++ b/llvm/test/tools/llvm-mca/AArch64/Exynos/shifted-register.s @@ -10,20 +10,20 @@ # ALL: Iterations: 100 # ALL-NEXT: Instructions: 400 -# EM1-NEXT: Total Cycles: 271 -# EM3-NEXT: Total Cycles: 203 +# EM1-NEXT: Total Cycles: 204 +# EM3-NEXT: Total Cycles: 154 # ALL-NEXT: Total uOps: 400 # EM1: Dispatch Width: 4 -# EM1-NEXT: uOps Per Cycle: 1.48 -# EM1-NEXT: IPC: 1.48 -# EM1-NEXT: Block RThroughput: 2.7 +# EM1-NEXT: uOps Per Cycle: 1.96 +# EM1-NEXT: IPC: 1.96 +# EM1-NEXT: Block RThroughput: 2.0 # EM3: Dispatch Width: 6 -# EM3-NEXT: uOps Per Cycle: 1.97 -# EM3-NEXT: IPC: 1.97 -# EM3-NEXT: Block RThroughput: 2.0 +# EM3-NEXT: uOps Per Cycle: 2.60 +# EM3-NEXT: IPC: 2.60 +# EM3-NEXT: Block RThroughput: 1.5 # ALL: Instruction Info: # ALL-NEXT: [1]: #uOps @@ -35,12 +35,12 @@ # ALL: [1] [2] [3] [4] [5] [6] Instructions: -# EM1-NEXT: 1 2 0.67 add w0, w1, w2 +# EM1-NEXT: 1 1 0.33 add w0, w1, w2 # EM1-NEXT: 1 2 0.67 sub x3, x4, x5, lsr #1 -# EM1-NEXT: 1 2 0.67 adds x6, x7, x8, lsl #2 +# EM1-NEXT: 1 1 0.33 adds x6, x7, x8, lsl #2 # EM1-NEXT: 1 2 0.67 subs w9, w10, w11, asr #3 -# EM3-NEXT: 1 2 0.50 add w0, w1, w2 +# EM3-NEXT: 1 1 0.25 add w0, w1, w2 # EM3-NEXT: 1 2 0.50 sub x3, x4, x5, lsr #1 -# EM3-NEXT: 1 2 0.50 adds x6, x7, x8, lsl #2 +# EM3-NEXT: 1 1 0.25 adds x6, x7, x8, lsl #2 # EM3-NEXT: 1 2 0.50 subs w9, w10, w11, asr #3 -- 2.7.4