// Two cycle ALU vector operation that uses an entire superslice.
// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
-// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
-def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C],
+// (EXECE, EXECO) and 1 dispatch (DISP) to the given superslice.
+def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
(instregex "VADDU(B|H|W|D)M$"),
(instregex "VAND(C)?$"),
// Restricted Dispatch ALU operation for 3 cycles. The operation runs on a
// single slice. However, since it is Restricted it requires all 3 dispatches
// (DISP) for that superslice.
-def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
(instregex "TABORT(D|W)C(I)?$"),
(instregex "MTFSB(0|1)$"),
)>;
// Standard Dispatch ALU operation for 3 cycles. Only one slice used.
-def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C],
(instrs
(instregex "XSMAX(C|J)?DP$"),
(instregex "XSMIN(C|J)?DP$"),
)>;
// Standard Dispatch ALU operation for 2 cycles. Only one slice used.
-def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C],
(instrs
(instregex "S(L|R)D$"),
(instregex "SRAD(I)?$"),
// Restricted Dispatch ALU operation for 2 cycles. The operation runs on a
// single slice. However, since it is Restricted it requires all 3 dispatches
// (DISP) for that superslice.
-def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
(instregex "RLDC(L|R)$"),
(instregex "RLWIMI(8)?$"),
// Three cycle ALU vector operation that uses an entire superslice.
// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines
-// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
-def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C],
+// (EXECE, EXECO) and 1 dispatch (DISP) to the given superslice.
+def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
(instregex "M(T|F)VSCR$"),
(instregex "VCMPNEZ(B|H|W)$"),
// 7 cycle DP vector operation that uses an entire superslice.
// Uses both DP units (the even DPE and odd DPO units), two pipelines
// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice.
-def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
VADDFP,
VCTSXS,
// 5 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three
// dispatch units for the superslice.
-def : InstRW<[P9_DP_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DP_5C, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
(instregex "MADD(HD|HDU|LD|LD8)$"),
(instregex "MUL(HD|HW|LD|LI|LI8|LW)(U)?$")
// 7 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three
// dispatch units for the superslice.
-def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
FRSP,
(instregex "FRI(N|P|Z|M)(D|S)$"),
// 7 cycle Restricted DP operation and one 3 cycle ALU operation.
// These operations can be done in parallel.
-// The DP is restricted so we need a full 5 dispatches.
+// The DP is restricted so we need a full 4 dispatches.
def : InstRW<[P9_DP_7C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "FSEL(D|S)o$")
)>;
// 5 Cycle Restricted DP operation and one 2 cycle ALU operation.
def : InstRW<[P9_DPOpAndALUOp_7C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "MUL(H|L)(D|W)(U)?o$")
)>;
// 7 cycle Restricted DP operation and one 3 cycle ALU operation.
// These operations must be done sequentially.
-// The DP is restricted so we need a full 5 dispatches.
+// The DP is restricted so we need a full 4 dispatches.
def : InstRW<[P9_DPOpAndALU2Op_10C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "FRI(N|P|Z|M)(D|S)o$"),
(instregex "FRE(S)?o$"),
FRSPo
)>;
-// 7 cycle DP operation. One DP unit, one EXEC pipeline and two dispatch units.
-def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
+// 7 cycle DP operation. One DP unit, one EXEC pipeline and 1 dispatch unit.
+def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C],
(instrs
XSADDDP,
XSADDSP,
)>;
// Three Cycle PM operation. Only one PM unit per superslice so we use the whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// superslice. That includes both exec pipelines (EXECO, EXECE) and one of the
// dispatches.
-def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C],
(instrs
(instregex "LVS(L|R)$"),
(instregex "VSPLTIS(W|H|B)$"),
)>;
// 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// superslice. That includes both exec pipelines (EXECO, EXECE) and one of the
// dispatches.
-def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
BCDSRo,
XSADDQP,
)>;
// 23 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// superslice. That includes both exec pipelines (EXECO, EXECE) and one of the
// dispatches.
-def : InstRW<[P9_DFU_23C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DFU_23C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
BCDCTSQo
)>;
// 24 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// superslice. That includes both exec pipelines (EXECO, EXECE) and one of the
// dispatches.
-def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
XSMADDQP,
XSMADDQPO,
)>;
// 37 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// superslice. That includes both exec pipelines (EXECO, EXECE) and one of the
// dispatches.
-def : InstRW<[P9_DFU_37C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DFU_37C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
BCDCFSQo
)>;
// 58 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// superslice. That includes both exec pipelines (EXECO, EXECE) and one of the
// dispatches.
-def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
XSDIVQP,
XSDIVQPO
// 76 Cycle DFU operation. Only one DFU unit per CPU so we use a whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
// dispatches.
-def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
(instrs
XSSQRTQP,
XSSQRTQPO
)>;
// 6 Cycle Load uses a single slice.
-def : InstRW<[P9_LS_6C, IP_AGEN_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_LS_6C, IP_AGEN_1C, DISP_1C],
(instrs
(instregex "LXVL(L)?")
)>;
// 5 Cycle Load uses a single slice.
-def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C],
(instrs
(instregex "LVE(B|H|W)X$"),
(instregex "LVX(L)?"),
)>;
// 4 Cycle Load uses a single slice.
-def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C],
(instrs
(instregex "DCB(F|T|ST)(EP)?$"),
(instregex "DCBZ(L)?(EP)?$"),
// 4 Cycle Restricted load uses a single slice but the dispatch for the whole
// superslice.
-def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_3SLOTS_1C],
(instrs
LFIWZX,
LFDX,
// Cracked Load Instructions.
// Load instructions that can be done in parallel.
def : InstRW<[P9_LS_4C, P9_LS_4C, IP_AGEN_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_PAIR_1C],
(instrs
SLBIA,
SLBIE,
// Requires Load and ALU pieces totaling 6 cycles. The Load and ALU
// operations can be run in parallel.
def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_PAIR_1C, DISP_PAIR_1C],
+ (instrs
+ (instregex "L(W|H)ZU(X)?(8)?$")
+)>;
+
+// Cracked TEND Instruction.
+// Requires Load and ALU pieces totaling 6 cycles. The Load and ALU
+// operations can be run in parallel.
+def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_AGEN_1C,
+ DISP_1C, DISP_1C],
(instrs
- (instregex "L(W|H)ZU(X)?(8)?$"),
TEND
)>;
+
// Cracked Store Instruction
// Consecutive Store and ALU instructions. The store is restricted and requires
// three dispatches.
def : InstRW<[P9_StoreAndALUOp_3C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "ST(B|H|W|D)CX$")
)>;
// Cracked Load Instruction.
// Two consecutive load operations for a total of 8 cycles.
def : InstRW<[P9_LoadAndLoadOp_8C, IP_AGEN_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C, DISP_1C],
(instrs
LDMX
)>;
// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU
// operations cannot be done at the same time and so their latencies are added.
def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C, DISP_1C],
(instrs
(instregex "LHA(X)?(8)?$"),
(instregex "CP_PASTE(8)?o$"),
// operations cannot be done at the same time and so their latencies are added.
// Full 6 dispatches are required as this is both cracked and restricted.
def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
LFIWAX
)>;
// Requires consecutive Load and ALU pieces totaling 7 cycles. The Load and ALU
// operations cannot be done at the same time and so their latencies are added.
// Full 4 dispatches are required as this is a cracked instruction.
-def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
LXSIWAX,
LIWAX
// their latencies are added.
// Full 6 dispatches are required as this is a restricted instruction.
def : InstRW<[P9_LoadAndALU2Op_7C, IP_AGEN_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
LFSX,
LFS
// Requires consecutive Load and ALU pieces totaling 8 cycles. The Load and ALU
// operations cannot be done at the same time and so their latencies are added.
// Full 4 dispatches are required as this is a cracked instruction.
-def : InstRW<[P9_LoadAndALU2Op_8C, IP_AGEN_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_LoadAndALU2Op_8C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
LXSSP,
LXSSPX,
// Cracked 3-Way Load Instruction
// Load with two ALU operations that depend on each other
def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_PAIR_1C, DISP_PAIR_1C, DISP_1C],
(instrs
(instregex "LHAU(X)?(8)?$"),
LWAUX
// Since the Load and the PM cannot be done at the same time the latencies are
// added. Requires 8 cycles.
// Since the PM requires the full superslice we need both EXECE, EXECO pipelines
-// as well as 3 dispatches for the PM. The Load requires the remaining 2
+// as well as 1 dispatch for the PM. The Load requires the other of the two
// dispatches.
def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C, DISP_1C],
(instrs
LXVH8X,
LXVDSX,
// Single slice Restricted store operation. The restricted operation requires
// all three dispatches for the superslice.
-def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_3SLOTS_1C],
(instrs
(instregex "STF(S|D|IWX|SX|DX)$"),
(instregex "STXS(D|DX|SPX|IWX|IBX|IHX|SP)(v)?$"),
)>;
// Vector Store Instruction
-// Requires the whole superslice and therefore requires all three dispatches
+// Requires the whole superslice and therefore requires one dispatch
// as well as both the Even and Odd exec pipelines.
-def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C,
- DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C, DISP_1C],
(instrs
(instregex "STVE(B|H|W)X$"),
(instregex "STVX(L)?$"),
)>;
// 5 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// superslice. That includes both exec pipelines (EXECO, EXECE) and two
// dispatches.
-def : InstRW<[P9_DIV_5C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DIV_5C, IP_EXECE_1C, IP_EXECO_1C, DISP_EVEN_1C],
(instrs
(instregex "MTCTR(8)?(loop)?$"),
(instregex "MTLR(8)?$")
)>;
// 12 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// superslice. That includes both exec pipelines (EXECO, EXECE) and two
// dispatches.
-def : InstRW<[P9_DIV_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DIV_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_EVEN_1C],
(instrs
(instregex "M(T|F)VRSAVE(v)?$"),
(instregex "M(T|F)PMR$"),
)>;
// 16 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// superslice. That includes both exec pipelines (EXECO, EXECE) and two
// dispatches.
-def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C,
- DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C],
(instrs
DIVW,
DIVWU,
)>;
// 24 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
+// superslice. That includes both exec pipelines (EXECO, EXECE) and two
// dispatches.
-def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C,
- DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C],
(instrs
DIVWE,
DIVD,
// 40 Cycle DIV operation. Only one DIV unit per superslice so we use the whole
// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
// dispatches.
-def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C,
- DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C],
(instrs
DIVDE,
DIVDEU
// and one full superslice for the DIV operation since there is only one DIV
// per superslice. Latency of DIV plus ALU is 26.
def : InstRW<[P9_IntDivAndALUOp_18C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_EVEN_1C, DISP_1C],
(instrs
(instregex "DIVW(U)?(O)?o$")
)>;
// and one full superslice for the DIV operation since there is only one DIV
// per superslice. Latency of DIV plus ALU is 26.
def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_EVEN_1C, DISP_1C],
(instrs
DIVDo,
DIVDUo,
// and one full superslice for the DIV operation since there is only one DIV
// per superslice. Latency of DIV plus ALU is 42.
def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_EVEN_1C, DISP_1C],
(instrs
DIVDEo,
DIVDEUo
// instructions running together on two pipelines and 6 dispatches.
// ALU ops are 2 cycles each.
def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
MTCRF,
MTCRF8
// Cracked ALU operations.
// Here the two ALU ops can actually be done in parallel and therefore the
// latencies are not added together. Otherwise this is like having two
-// instructions running together on two pipelines and 4 dispatches.
+// instructions running together on two pipelines and 2 dispatches.
// ALU ops are 2 cycles each.
def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C, DISP_1C],
(instrs
(instregex "ADDC(8)?o$"),
(instregex "SUBFC(8)?o$")
// One of the ALU ops is restricted the other is not so we have a total of
// 5 dispatches.
def : InstRW<[P9_ALU_2C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "F(N)?ABS(D|S)o$"),
(instregex "FCPSGN(D|S)o$"),
// Cracked ALU operations.
// Here the two ALU ops can actually be done in parallel and therefore the
// latencies are not added together. Otherwise this is like having two
-// instructions running together on two pipelines and 4 dispatches.
+// instructions running together on two pipelines and 2 dispatches.
// ALU ops are 3 cycles each.
def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C, DISP_1C],
(instrs
MCRFS
)>;
// instructions running together on two pipelines and 6 dispatches.
// ALU ops are 3 cycles each.
def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
(instregex "MTFSF(b|o)?$"),
(instregex "MTFSFI(o)?$")
// The two ops cannot be done in parallel.
// One of the ALU ops is restricted and takes 3 dispatches.
def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "RLD(I)?C(R|L)o$"),
(instregex "RLW(IMI|INM|NM)(8)?o$"),
// The two ops cannot be done in parallel.
// Both of the ALU ops are restricted and take 3 dispatches.
def : InstRW<[P9_ALU2OpAndALU2Op_6C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
(instregex "MFFS(L|CE|o)?$")
)>;
// total of 6 cycles. All of the ALU operations are also restricted so each
// takes 3 dispatches for a total of 9.
def : InstRW<[P9_ALUOpAndALUOpAndALUOp_6C, IP_EXEC_1C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
- DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_3SLOTS_1C, DISP_3SLOTS_1C],
(instrs
(instregex "MFCR(8)?$")
)>;
// Cracked instruction made of two ALU ops.
// The two ops cannot be done in parallel.
-def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C],
(instrs
(instregex "EXTSWSLIo$"),
(instregex "SRAD(I)?o$"),
)>;
// 33 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
-def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
FDIV
)>;
// 33 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
def : InstRW<[P9_DPOpAndALU2Op_36C_8, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
FDIVo
)>;
// 36 Cycle DP Instruction.
// Instruction can be done on a single slice.
-def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C],
(instrs
XSSQRTDP
)>;
// 36 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
-def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
FSQRT
)>;
// 36 Cycle DP Vector Instruction.
def : InstRW<[P9_DPE_36C_10, P9_DPO_36C_10, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C],
(instrs
XVSQRTDP
)>;
// 27 Cycle DP Vector Instruction.
def : InstRW<[P9_DPE_27C_10, P9_DPO_27C_10, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C],
(instrs
XVSQRTSP
)>;
// 36 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
def : InstRW<[P9_DPOpAndALU2Op_39C_10, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
FSQRTo
)>;
// 26 Cycle DP Instruction.
-def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C],
(instrs
XSSQRTSP
)>;
// 26 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
-def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
FSQRTS
)>;
// 26 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU.
def : InstRW<[P9_DPOpAndALU2Op_29C_5, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
FSQRTSo
)>;
-// 33 Cycle DP Instruction. Takes one slice and 2 dispatches.
-def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C],
+// 33 Cycle DP Instruction. Takes one slice and 1 dispatch.
+def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C],
(instrs
XSDIVDP
)>;
// 22 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches.
-def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_3SLOTS_1C],
(instrs
FDIVS
)>;
// 22 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU.
def : InstRW<[P9_DPOpAndALU2Op_25C_5, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
FDIVSo
)>;
-// 22 Cycle DP Instruction. Takes one slice and 2 dispatches.
-def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C],
+// 22 Cycle DP Instruction. Takes one slice and 1 dispatch.
+def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C],
(instrs
XSDIVSP
)>;
// 24 Cycle DP Vector Instruction. Takes one full superslice.
-// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given
+// Includes both EXECE, EXECO pipelines and 1 dispatch for the given
// superslice.
def : InstRW<[P9_DPE_24C_8, P9_DPO_24C_8, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C],
(instrs
XVDIVSP
)>;
// 33 Cycle DP Vector Instruction. Takes one full superslice.
-// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given
+// Includes both EXECE, EXECO pipelines and 1 dispatch for the given
// superslice.
def : InstRW<[P9_DPE_33C_8, P9_DPO_33C_8, IP_EXECE_1C, IP_EXECO_1C,
- DISP_1C, DISP_1C, DISP_1C],
+ DISP_1C],
(instrs
XVDIVDP
)>;
// The Load and one of the ALU ops cannot be run at the same time and so the
// latencies are added together for 6 cycles. The remainaing ALU is 2 cycles.
// Both the load and the ALU that depends on it are restricted and so they take
-// a total of 6 dispatches. The final 2 dispatches come from the second ALU op.
+// a total of 6 dispatches. The final dispatch comes from the second ALU op.
// The two EXEC pipelines are for the 2 ALUs while the AGEN is for the load.
def : InstRW<[P9_LoadAndALU2Op_7C, P9_ALU_2C,
IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "LF(SU|SUX)$")
)>;
// the store and so it can be run at the same time as the store. The store is
// also restricted.
def : InstRW<[P9_LS_1C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "STF(S|D)U(X)?$"),
(instregex "ST(B|H|W|D)U(X)?(8)?$")
// Cracked instruction made up of a Load and an ALU. The ALU does not depend on
// the load and so it can be run at the same time as the load.
def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_PAIR_1C, DISP_PAIR_1C],
(instrs
(instregex "LBZU(X)?(8)?$"),
(instregex "LDU(X)?$")
// are from the ALU. The AGEN pipeline is from the load and the EXEC pipeline
// is required for the ALU.
def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+ DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "LF(DU|DUX)$")
)>;
// Crypto Instructions
// 6 Cycle CY operation. Only one CY unit per CPU so we use a whole
-// superslice. That includes both exec pipelines (EXECO, EXECE) and all three
-// dispatches.
-def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
+// superslice. That includes both exec pipelines (EXECO, EXECE) and one
+// dispatch.
+def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C],
(instrs
(instregex "VPMSUM(B|H|W|D)$"),
(instregex "V(N)?CIPHER(LAST)?$"),
// Branch Instructions
// Two Cycle Branch
-def : InstRW<[P9_BR_2C, DISP_1C, DISP_1C],
+def : InstRW<[P9_BR_2C, DISP_BR_1C],
(instrs
(instregex "BCCCTR(L)?(8)?$"),
(instregex "BCCL(A|R|RL)?$"),
// Five Cycle Branch with a 2 Cycle ALU Op
// Operations must be done consecutively and not in parallel.
-def : InstRW<[P9_BROpAndALUOp_7C, IP_EXEC_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+def : InstRW<[P9_BROpAndALUOp_7C, IP_EXEC_1C, DISP_BR_1C, DISP_1C],
(instrs
ADDPCIS
)>;
// Atomic Load
def : InstRW<[P9_LS_1C, P9_LS_1C, P9_LS_4C, P9_LS_4C, P9_LS_4C,
IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, IP_AGEN_1C, IP_AGEN_1C,
- IP_AGEN_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
- DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
- DISP_1C],
+ IP_AGEN_1C, IP_AGEN_1C, DISP_1C, DISP_3SLOTS_1C,
+ DISP_3SLOTS_1C, DISP_1C, DISP_1C, DISP_1C],
(instrs
(instregex "L(D|W)AT$")
)>;
// Atomic Store
def : InstRW<[P9_LS_1C, P9_LS_4C, P9_LS_4C, IP_EXEC_1C, IP_AGEN_1C, IP_AGEN_1C,
- IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
- DISP_1C],
+ IP_AGEN_1C, DISP_1C, DISP_3SLOTS_1C, DISP_1C],
(instrs
(instregex "ST(D|W)AT$")
)>;
;
; CHECK-P9-LABEL: test16elt:
; CHECK-P9: # %bb.0: # %entry
-; CHECK-P9-NEXT: lxv vs0, 0(r3)
+; CHECK-P9-NEXT: lxv vs2, 0(r3)
+; CHECK-P9-NEXT: xxsldwi vs3, vs2, vs2, 3
+; CHECK-P9-NEXT: xscvspdpn f3, vs3
+; CHECK-P9-NEXT: xscvdpsxws f3, f3
+; CHECK-P9-NEXT: lxv vs0, 48(r3)
+; CHECK-P9-NEXT: lxv vs1, 32(r3)
+; CHECK-P9-NEXT: lxv vs4, 16(r3)
+; CHECK-P9-NEXT: mfvsrwz r3, f3
+; CHECK-P9-NEXT: mtvsrd f3, r3
+; CHECK-P9-NEXT: xxswapd v2, vs3
+; CHECK-P9-NEXT: xxswapd vs3, vs2
+; CHECK-P9-NEXT: xscvspdpn f3, vs3
+; CHECK-P9-NEXT: xscvdpsxws f3, f3
+; CHECK-P9-NEXT: mfvsrwz r3, f3
+; CHECK-P9-NEXT: mtvsrd f3, r3
+; CHECK-P9-NEXT: xxswapd v3, vs3
+; CHECK-P9-NEXT: xscvspdpn f3, vs2
+; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1
+; CHECK-P9-NEXT: xscvspdpn f2, vs2
+; CHECK-P9-NEXT: xscvdpsxws f3, f3
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: mfvsrwz r3, f3
+; CHECK-P9-NEXT: mtvsrd f3, r3
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: xxswapd v4, vs2
+; CHECK-P9-NEXT: xxsldwi vs2, vs4, vs4, 3
+; CHECK-P9-NEXT: xscvspdpn f2, vs2
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: vmrglb v2, v3, v2
+; CHECK-P9-NEXT: xxswapd v3, vs3
+; CHECK-P9-NEXT: vmrglb v3, v3, v4
+; CHECK-P9-NEXT: vmrglh v2, v3, v2
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: xxswapd v3, vs2
+; CHECK-P9-NEXT: xxswapd vs2, vs4
+; CHECK-P9-NEXT: xscvspdpn f2, vs2
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: xxswapd v4, vs2
+; CHECK-P9-NEXT: xscvspdpn f2, vs4
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: vmrglb v3, v4, v3
+; CHECK-P9-NEXT: xxswapd v4, vs2
+; CHECK-P9-NEXT: xxsldwi vs2, vs4, vs4, 1
+; CHECK-P9-NEXT: xscvspdpn f2, vs2
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: xxswapd v5, vs2
+; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3
+; CHECK-P9-NEXT: xscvspdpn f2, vs2
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: vmrglb v4, v4, v5
+; CHECK-P9-NEXT: vmrglh v3, v4, v3
+; CHECK-P9-NEXT: vmrglw v2, v3, v2
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: xxswapd v3, vs2
+; CHECK-P9-NEXT: xxswapd vs2, vs1
+; CHECK-P9-NEXT: xscvspdpn f2, vs2
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: xxswapd v4, vs2
+; CHECK-P9-NEXT: xscvspdpn f2, vs1
+; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1
+; CHECK-P9-NEXT: xscvspdpn f1, vs1
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: xscvdpsxws f1, f1
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: mfvsrwz r3, f1
+; CHECK-P9-NEXT: mtvsrd f1, r3
+; CHECK-P9-NEXT: xxswapd v5, vs1
; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3
; CHECK-P9-NEXT: xscvspdpn f1, vs1
; CHECK-P9-NEXT: xscvdpsxws f1, f1
-; CHECK-P9-NEXT: mfvsrwz r4, f1
-; CHECK-P9-NEXT: mtvsrd f1, r4
-; CHECK-P9-NEXT: xxswapd v2, vs1
+; CHECK-P9-NEXT: vmrglb v3, v4, v3
+; CHECK-P9-NEXT: xxswapd v4, vs2
+; CHECK-P9-NEXT: vmrglb v4, v4, v5
+; CHECK-P9-NEXT: vmrglh v3, v4, v3
+; CHECK-P9-NEXT: mfvsrwz r3, f1
+; CHECK-P9-NEXT: mtvsrd f1, r3
+; CHECK-P9-NEXT: xxswapd v4, vs1
; CHECK-P9-NEXT: xxswapd vs1, vs0
; CHECK-P9-NEXT: xscvspdpn f1, vs1
; CHECK-P9-NEXT: xscvdpsxws f1, f1
-; CHECK-P9-NEXT: lxv vs2, 48(r3)
-; CHECK-P9-NEXT: lxv vs3, 32(r3)
-; CHECK-P9-NEXT: lxv vs4, 16(r3)
; CHECK-P9-NEXT: mfvsrwz r3, f1
; CHECK-P9-NEXT: mtvsrd f1, r3
-; CHECK-P9-NEXT: xxswapd v3, vs1
+; CHECK-P9-NEXT: xxswapd v5, vs1
; CHECK-P9-NEXT: xscvspdpn f1, vs0
; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1
; CHECK-P9-NEXT: xscvspdpn f0, vs0
; CHECK-P9-NEXT: mtvsrd f1, r3
; CHECK-P9-NEXT: mfvsrwz r3, f0
; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v4, vs0
-; CHECK-P9-NEXT: xxsldwi vs0, vs4, vs4, 3
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: vmrglb v2, v3, v2
-; CHECK-P9-NEXT: xxswapd v3, vs1
-; CHECK-P9-NEXT: vmrglb v3, v3, v4
-; CHECK-P9-NEXT: vmrglh v2, v3, v2
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v3, vs0
-; CHECK-P9-NEXT: xxswapd vs0, vs4
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v4, vs0
-; CHECK-P9-NEXT: xscvspdpn f0, vs4
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: vmrglb v3, v4, v3
-; CHECK-P9-NEXT: xxswapd v4, vs0
-; CHECK-P9-NEXT: xxsldwi vs0, vs4, vs4, 1
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v5, vs0
-; CHECK-P9-NEXT: xxsldwi vs0, vs3, vs3, 3
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: vmrglb v4, v4, v5
-; CHECK-P9-NEXT: vmrglh v3, v4, v3
-; CHECK-P9-NEXT: vmrglw v2, v3, v2
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v3, vs0
-; CHECK-P9-NEXT: xxswapd vs0, vs3
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v4, vs0
-; CHECK-P9-NEXT: xscvspdpn f0, vs3
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: vmrglb v3, v4, v3
-; CHECK-P9-NEXT: xxswapd v4, vs0
-; CHECK-P9-NEXT: xxsldwi vs0, vs3, vs3, 1
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v5, vs0
-; CHECK-P9-NEXT: xxsldwi vs0, vs2, vs2, 3
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: vmrglb v4, v4, v5
-; CHECK-P9-NEXT: vmrglh v3, v4, v3
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v4, vs0
-; CHECK-P9-NEXT: xxswapd vs0, vs2
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v5, vs0
-; CHECK-P9-NEXT: xscvspdpn f0, vs2
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
; CHECK-P9-NEXT: vmrglb v4, v5, v4
-; CHECK-P9-NEXT: xxswapd v5, vs0
-; CHECK-P9-NEXT: xxsldwi vs0, vs2, vs2, 1
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
+; CHECK-P9-NEXT: xxswapd v5, vs1
; CHECK-P9-NEXT: xxswapd v0, vs0
; CHECK-P9-NEXT: vmrglb v5, v5, v0
; CHECK-P9-NEXT: vmrglh v4, v5, v4
;
; CHECK-P9-LABEL: test16elt_signed:
; CHECK-P9: # %bb.0: # %entry
-; CHECK-P9-NEXT: lxv vs0, 0(r3)
+; CHECK-P9-NEXT: lxv vs2, 0(r3)
+; CHECK-P9-NEXT: xxsldwi vs3, vs2, vs2, 3
+; CHECK-P9-NEXT: xscvspdpn f3, vs3
+; CHECK-P9-NEXT: xscvdpsxws f3, f3
+; CHECK-P9-NEXT: lxv vs0, 48(r3)
+; CHECK-P9-NEXT: lxv vs1, 32(r3)
+; CHECK-P9-NEXT: lxv vs4, 16(r3)
+; CHECK-P9-NEXT: mfvsrwz r3, f3
+; CHECK-P9-NEXT: mtvsrd f3, r3
+; CHECK-P9-NEXT: xxswapd v2, vs3
+; CHECK-P9-NEXT: xxswapd vs3, vs2
+; CHECK-P9-NEXT: xscvspdpn f3, vs3
+; CHECK-P9-NEXT: xscvdpsxws f3, f3
+; CHECK-P9-NEXT: mfvsrwz r3, f3
+; CHECK-P9-NEXT: mtvsrd f3, r3
+; CHECK-P9-NEXT: xxswapd v3, vs3
+; CHECK-P9-NEXT: xscvspdpn f3, vs2
+; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1
+; CHECK-P9-NEXT: xscvspdpn f2, vs2
+; CHECK-P9-NEXT: xscvdpsxws f3, f3
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: mfvsrwz r3, f3
+; CHECK-P9-NEXT: mtvsrd f3, r3
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: xxswapd v4, vs2
+; CHECK-P9-NEXT: xxsldwi vs2, vs4, vs4, 3
+; CHECK-P9-NEXT: xscvspdpn f2, vs2
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: vmrglb v2, v3, v2
+; CHECK-P9-NEXT: xxswapd v3, vs3
+; CHECK-P9-NEXT: vmrglb v3, v3, v4
+; CHECK-P9-NEXT: vmrglh v2, v3, v2
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: xxswapd v3, vs2
+; CHECK-P9-NEXT: xxswapd vs2, vs4
+; CHECK-P9-NEXT: xscvspdpn f2, vs2
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: xxswapd v4, vs2
+; CHECK-P9-NEXT: xscvspdpn f2, vs4
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: vmrglb v3, v4, v3
+; CHECK-P9-NEXT: xxswapd v4, vs2
+; CHECK-P9-NEXT: xxsldwi vs2, vs4, vs4, 1
+; CHECK-P9-NEXT: xscvspdpn f2, vs2
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: xxswapd v5, vs2
+; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3
+; CHECK-P9-NEXT: xscvspdpn f2, vs2
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: vmrglb v4, v4, v5
+; CHECK-P9-NEXT: vmrglh v3, v4, v3
+; CHECK-P9-NEXT: vmrglw v2, v3, v2
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: xxswapd v3, vs2
+; CHECK-P9-NEXT: xxswapd vs2, vs1
+; CHECK-P9-NEXT: xscvspdpn f2, vs2
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: xxswapd v4, vs2
+; CHECK-P9-NEXT: xscvspdpn f2, vs1
+; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1
+; CHECK-P9-NEXT: xscvspdpn f1, vs1
+; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: xscvdpsxws f1, f1
+; CHECK-P9-NEXT: mfvsrwz r3, f2
+; CHECK-P9-NEXT: mtvsrd f2, r3
+; CHECK-P9-NEXT: mfvsrwz r3, f1
+; CHECK-P9-NEXT: mtvsrd f1, r3
+; CHECK-P9-NEXT: xxswapd v5, vs1
; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3
; CHECK-P9-NEXT: xscvspdpn f1, vs1
; CHECK-P9-NEXT: xscvdpsxws f1, f1
-; CHECK-P9-NEXT: mfvsrwz r4, f1
-; CHECK-P9-NEXT: mtvsrd f1, r4
-; CHECK-P9-NEXT: xxswapd v2, vs1
+; CHECK-P9-NEXT: vmrglb v3, v4, v3
+; CHECK-P9-NEXT: xxswapd v4, vs2
+; CHECK-P9-NEXT: vmrglb v4, v4, v5
+; CHECK-P9-NEXT: vmrglh v3, v4, v3
+; CHECK-P9-NEXT: mfvsrwz r3, f1
+; CHECK-P9-NEXT: mtvsrd f1, r3
+; CHECK-P9-NEXT: xxswapd v4, vs1
; CHECK-P9-NEXT: xxswapd vs1, vs0
; CHECK-P9-NEXT: xscvspdpn f1, vs1
; CHECK-P9-NEXT: xscvdpsxws f1, f1
-; CHECK-P9-NEXT: lxv vs2, 48(r3)
-; CHECK-P9-NEXT: lxv vs3, 32(r3)
-; CHECK-P9-NEXT: lxv vs4, 16(r3)
; CHECK-P9-NEXT: mfvsrwz r3, f1
; CHECK-P9-NEXT: mtvsrd f1, r3
-; CHECK-P9-NEXT: xxswapd v3, vs1
+; CHECK-P9-NEXT: xxswapd v5, vs1
; CHECK-P9-NEXT: xscvspdpn f1, vs0
; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1
; CHECK-P9-NEXT: xscvspdpn f0, vs0
; CHECK-P9-NEXT: mtvsrd f1, r3
; CHECK-P9-NEXT: mfvsrwz r3, f0
; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v4, vs0
-; CHECK-P9-NEXT: xxsldwi vs0, vs4, vs4, 3
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: vmrglb v2, v3, v2
-; CHECK-P9-NEXT: xxswapd v3, vs1
-; CHECK-P9-NEXT: vmrglb v3, v3, v4
-; CHECK-P9-NEXT: vmrglh v2, v3, v2
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v3, vs0
-; CHECK-P9-NEXT: xxswapd vs0, vs4
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v4, vs0
-; CHECK-P9-NEXT: xscvspdpn f0, vs4
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: vmrglb v3, v4, v3
-; CHECK-P9-NEXT: xxswapd v4, vs0
-; CHECK-P9-NEXT: xxsldwi vs0, vs4, vs4, 1
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v5, vs0
-; CHECK-P9-NEXT: xxsldwi vs0, vs3, vs3, 3
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: vmrglb v4, v4, v5
-; CHECK-P9-NEXT: vmrglh v3, v4, v3
-; CHECK-P9-NEXT: vmrglw v2, v3, v2
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v3, vs0
-; CHECK-P9-NEXT: xxswapd vs0, vs3
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v4, vs0
-; CHECK-P9-NEXT: xscvspdpn f0, vs3
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: vmrglb v3, v4, v3
-; CHECK-P9-NEXT: xxswapd v4, vs0
-; CHECK-P9-NEXT: xxsldwi vs0, vs3, vs3, 1
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v5, vs0
-; CHECK-P9-NEXT: xxsldwi vs0, vs2, vs2, 3
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: vmrglb v4, v4, v5
-; CHECK-P9-NEXT: vmrglh v3, v4, v3
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v4, vs0
-; CHECK-P9-NEXT: xxswapd vs0, vs2
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
-; CHECK-P9-NEXT: xxswapd v5, vs0
-; CHECK-P9-NEXT: xscvspdpn f0, vs2
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
; CHECK-P9-NEXT: vmrglb v4, v5, v4
-; CHECK-P9-NEXT: xxswapd v5, vs0
-; CHECK-P9-NEXT: xxsldwi vs0, vs2, vs2, 1
-; CHECK-P9-NEXT: xscvspdpn f0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mfvsrwz r3, f0
-; CHECK-P9-NEXT: mtvsrd f0, r3
+; CHECK-P9-NEXT: xxswapd v5, vs1
; CHECK-P9-NEXT: xxswapd v0, vs0
; CHECK-P9-NEXT: vmrglb v5, v5, v0
; CHECK-P9-NEXT: vmrglh v4, v5, v4