rs6000: Add MMA built-in function definitions and test cases.
author Peter Bergner <bergner@linux.ibm.com>
Sun, 21 Jun 2020 04:23:02 +0000 (23:23 -0500)
committer Peter Bergner <bergner@linux.ibm.com>
Sun, 21 Jun 2020 05:26:13 +0000 (00:26 -0500)
Add the Matrix-Multiply Assist (MMA) built-ins.  The MMA accumulators are
INOUT operands for most MMA instructions, but they are also very expensive
to move around.  For this reason, we have implemented a built-in API where
the accumulators are passed using pass-by-reference/pointers, so the user
won't use one accumulator as input and another as output, which would entail
a lot of copies.  However, using pointers gives us poor code generation
when we expand the built-ins at normal expand time.  We therefore expand
the MMA built-ins early into gimple, converting the pass-by-reference calls
to an internal built-in that uses pass-by-value calling convention, where
we can enforce the input and output accumulators are the same.  This gives
us much better code generation.

2020-06-20  Peter Bergner  <bergner@linux.ibm.com>

gcc/
* config/rs6000/predicates.md (mma_assemble_input_operand): New.
* config/rs6000/rs6000-builtin.def (BU_MMA_1, BU_MMA_V2, BU_MMA_3,
BU_MMA_5, BU_MMA_6, BU_VSX_1): Add support macros for defining MMA
built-in functions.
(ASSEMBLE_ACC, ASSEMBLE_PAIR, DISASSEMBLE_ACC, DISASSEMBLE_PAIR,
PMXVBF16GER2, PMXVBF16GER2NN, PMXVBF16GER2NP, PMXVBF16GER2PN,
PMXVBF16GER2PP, PMXVF16GER2, PMXVF16GER2NN, PMXVF16GER2NP,
PMXVF16GER2PN, PMXVF16GER2PP, PMXVF32GER, PMXVF32GERNN,
PMXVF32GERNP, PMXVF32GERPN, PMXVF32GERPP, PMXVF64GER, PMXVF64GERNN,
PMXVF64GERNP, PMXVF64GERPN, PMXVF64GERPP, PMXVI16GER2, PMXVI16GER2PP,
PMXVI16GER2S, PMXVI16GER2SPP, PMXVI4GER8, PMXVI4GER8PP, PMXVI8GER4,
PMXVI8GER4PP, PMXVI8GER4SPP, XVBF16GER2, XVBF16GER2NN, XVBF16GER2NP,
XVBF16GER2PN, XVBF16GER2PP, XVCVBF16SP, XVCVSPBF16, XVF16GER2,
XVF16GER2NN, XVF16GER2NP, XVF16GER2PN, XVF16GER2PP, XVF32GER,
XVF32GERNN, XVF32GERNP, XVF32GERPN, XVF32GERPP, XVF64GER, XVF64GERNN,
XVF64GERNP, XVF64GERPN, XVF64GERPP, XVI16GER2, XVI16GER2PP, XVI16GER2S,
XVI16GER2SPP, XVI4GER8, XVI4GER8PP, XVI8GER4, XVI8GER4PP, XVI8GER4SPP,
XXMFACC, XXMTACC, XXSETACCZ): Add MMA built-ins.
* config/rs6000/rs6000.c (rs6000_emit_move): Use CONST_INT_P.
Allow zero constants.
(print_operand) <case 'A'>: New output modifier.
(rs6000_split_multireg_move): Add support for inserting accumulator
priming and depriming instructions.  Add support for splitting an
assemble accumulator pattern.
* config/rs6000/rs6000-call.c (mma_init_builtins, mma_expand_builtin,
rs6000_gimple_fold_mma_builtin): New functions.
(RS6000_BUILTIN_M): New macro.
(def_builtin): Handle RS6000_BTC_QUAD and RS6000_BTC_PAIR attributes.
(bdesc_mma): Add new MMA built-in support.
(htm_expand_builtin): Use RS6000_BTC_OPND_MASK.
(rs6000_invalid_builtin): Add handling of RS6000_BTM_FUTURE and
RS6000_BTM_MMA.
(rs6000_builtin_valid_without_lhs): Handle RS6000_BTC_VOID attribute.
(rs6000_gimple_fold_builtin): Call rs6000_builtin_is_supported_p
and rs6000_gimple_fold_mma_builtin.
(rs6000_expand_builtin): Call mma_expand_builtin.
Use RS6000_BTC_OPND_MASK.
(rs6000_init_builtins): Adjust comment.  Call mma_init_builtins.
(htm_init_builtins): Use RS6000_BTC_OPND_MASK.
(builtin_function_type): Handle VSX_BUILTIN_XVCVSPBF16 and
VSX_BUILTIN_XVCVBF16SP.
* config/rs6000/rs6000.h (RS6000_BTC_QUINARY, RS6000_BTC_SENARY,
RS6000_BTC_OPND_MASK, RS6000_BTC_QUAD, RS6000_BTC_PAIR,
RS6000_BTC_QUADPAIR, RS6000_BTC_GIMPLE): New defines.
(RS6000_BTC_PREDICATE, RS6000_BTC_ABS, RS6000_BTC_DST,
RS6000_BTC_TYPE_MASK, RS6000_BTC_ATTR_MASK): Adjust values.
* config/rs6000/mma.md (MAX_MMA_OPERANDS): New define_constant.
(UNSPEC_MMA_ASSEMBLE_ACC, UNSPEC_MMA_PMXVBF16GER2,
UNSPEC_MMA_PMXVBF16GER2NN, UNSPEC_MMA_PMXVBF16GER2NP,
UNSPEC_MMA_PMXVBF16GER2PN, UNSPEC_MMA_PMXVBF16GER2PP,
UNSPEC_MMA_PMXVF16GER2, UNSPEC_MMA_PMXVF16GER2NN,
UNSPEC_MMA_PMXVF16GER2NP, UNSPEC_MMA_PMXVF16GER2PN,
UNSPEC_MMA_PMXVF16GER2PP, UNSPEC_MMA_PMXVF32GER,
UNSPEC_MMA_PMXVF32GERNN, UNSPEC_MMA_PMXVF32GERNP,
UNSPEC_MMA_PMXVF32GERPN, UNSPEC_MMA_PMXVF32GERPP,
UNSPEC_MMA_PMXVF64GER, UNSPEC_MMA_PMXVF64GERNN,
UNSPEC_MMA_PMXVF64GERNP, UNSPEC_MMA_PMXVF64GERPN,
UNSPEC_MMA_PMXVF64GERPP, UNSPEC_MMA_PMXVI16GER2,
UNSPEC_MMA_PMXVI16GER2PP, UNSPEC_MMA_PMXVI16GER2S,
UNSPEC_MMA_PMXVI16GER2SPP, UNSPEC_MMA_PMXVI4GER8,
UNSPEC_MMA_PMXVI4GER8PP, UNSPEC_MMA_PMXVI8GER4,
UNSPEC_MMA_PMXVI8GER4PP, UNSPEC_MMA_PMXVI8GER4SPP,
UNSPEC_MMA_XVBF16GER2, UNSPEC_MMA_XVBF16GER2NN,
UNSPEC_MMA_XVBF16GER2NP, UNSPEC_MMA_XVBF16GER2PN,
UNSPEC_MMA_XVBF16GER2PP, UNSPEC_MMA_XVF16GER2, UNSPEC_MMA_XVF16GER2NN,
UNSPEC_MMA_XVF16GER2NP, UNSPEC_MMA_XVF16GER2PN, UNSPEC_MMA_XVF16GER2PP,
UNSPEC_MMA_XVF32GER, UNSPEC_MMA_XVF32GERNN, UNSPEC_MMA_XVF32GERNP,
UNSPEC_MMA_XVF32GERPN, UNSPEC_MMA_XVF32GERPP, UNSPEC_MMA_XVF64GER,
UNSPEC_MMA_XVF64GERNN, UNSPEC_MMA_XVF64GERNP, UNSPEC_MMA_XVF64GERPN,
UNSPEC_MMA_XVF64GERPP, UNSPEC_MMA_XVI16GER2, UNSPEC_MMA_XVI16GER2PP,
UNSPEC_MMA_XVI16GER2S, UNSPEC_MMA_XVI16GER2SPP, UNSPEC_MMA_XVI4GER8,
UNSPEC_MMA_XVI4GER8PP, UNSPEC_MMA_XVI8GER4, UNSPEC_MMA_XVI8GER4PP,
UNSPEC_MMA_XVI8GER4SPP, UNSPEC_MMA_XXMFACC, UNSPEC_MMA_XXMTACC): New.
(MMA_ACC, MMA_VV, MMA_AVV, MMA_PV, MMA_APV, MMA_VVI4I4I8,
MMA_AVVI4I4I8, MMA_VVI4I4I2, MMA_AVVI4I4I2, MMA_VVI4I4,
MMA_AVVI4I4, MMA_PVI4I2, MMA_APVI4I2, MMA_VVI4I4I4,
MMA_AVVI4I4I4): New define_int_iterator.
(acc, vv, avv, pv, apv, vvi4i4i8, avvi4i4i8, vvi4i4i2,
avvi4i4i2, vvi4i4, avvi4i4, pvi4i2, apvi4i2, vvi4i4i4,
avvi4i4i4): New define_int_attr.
(*movpxi): Add zero constant alternative.
(mma_assemble_pair, mma_assemble_acc): New define_expand.
(*mma_assemble_acc): New define_insn_and_split.
(mma_<acc>, mma_xxsetaccz, mma_<vv>, mma_<avv>, mma_<pv>, mma_<apv>,
mma_<vvi4i4i8>, mma_<avvi4i4i8>, mma_<vvi4i4i2>, mma_<avvi4i4i2>,
mma_<vvi4i4>, mma_<avvi4i4>, mma_<pvi4i2>, mma_<apvi4i2>,
mma_<vvi4i4i4>, mma_<avvi4i4i4>): New define_insn.
* config/rs6000/rs6000.md (define_attr "type"): New type mma.
* config/rs6000/vsx.md (UNSPEC_VSX_XVCVBF16SP): New.
(UNSPEC_VSX_XVCVSPBF16): Likewise.
(XVCVBF16): New define_int_iterator.
(xvcvbf16): New define_int_attr.
(vsx_<xvcvbf16>): New define_insn.
* doc/extend.texi: Document the mma built-ins.

15 files changed:
gcc/config/rs6000/mma.md
gcc/config/rs6000/predicates.md
gcc/config/rs6000/rs6000-builtin.def
gcc/config/rs6000/rs6000-call.c
gcc/config/rs6000/rs6000.c
gcc/config/rs6000/rs6000.h
gcc/config/rs6000/rs6000.md
gcc/config/rs6000/vsx.md
gcc/doc/extend.texi
gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/powerpc/mma-builtin-2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/powerpc/mma-builtin-3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/powerpc/mma-builtin-4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/powerpc/mma-builtin-5.c [new file with mode: 0644]
gcc/testsuite/gcc.target/powerpc/mma-builtin-6.c [new file with mode: 0644]

index 6e4cfca..15cacfb 100644 (file)
 ;; therefore, we define the XImode and OImode move patterns, but we
 ;; disable their use with a "false" condition flag.
 
+(define_constants [(MAX_MMA_OPERANDS 7)])
+
+;; Constants for creating unspecs
+
+(define_c_enum "unspec"
+  [UNSPEC_MMA_ASSEMBLE_ACC
+   UNSPEC_MMA_PMXVBF16GER2
+   UNSPEC_MMA_PMXVBF16GER2NN
+   UNSPEC_MMA_PMXVBF16GER2NP
+   UNSPEC_MMA_PMXVBF16GER2PN
+   UNSPEC_MMA_PMXVBF16GER2PP
+   UNSPEC_MMA_PMXVF16GER2
+   UNSPEC_MMA_PMXVF16GER2NN
+   UNSPEC_MMA_PMXVF16GER2NP
+   UNSPEC_MMA_PMXVF16GER2PN
+   UNSPEC_MMA_PMXVF16GER2PP
+   UNSPEC_MMA_PMXVF32GER
+   UNSPEC_MMA_PMXVF32GERNN
+   UNSPEC_MMA_PMXVF32GERNP
+   UNSPEC_MMA_PMXVF32GERPN
+   UNSPEC_MMA_PMXVF32GERPP
+   UNSPEC_MMA_PMXVF64GER
+   UNSPEC_MMA_PMXVF64GERNN
+   UNSPEC_MMA_PMXVF64GERNP
+   UNSPEC_MMA_PMXVF64GERPN
+   UNSPEC_MMA_PMXVF64GERPP
+   UNSPEC_MMA_PMXVI16GER2
+   UNSPEC_MMA_PMXVI16GER2PP
+   UNSPEC_MMA_PMXVI16GER2S
+   UNSPEC_MMA_PMXVI16GER2SPP
+   UNSPEC_MMA_PMXVI4GER8
+   UNSPEC_MMA_PMXVI4GER8PP
+   UNSPEC_MMA_PMXVI8GER4
+   UNSPEC_MMA_PMXVI8GER4PP
+   UNSPEC_MMA_PMXVI8GER4SPP
+   UNSPEC_MMA_XVBF16GER2
+   UNSPEC_MMA_XVBF16GER2NN
+   UNSPEC_MMA_XVBF16GER2NP
+   UNSPEC_MMA_XVBF16GER2PN
+   UNSPEC_MMA_XVBF16GER2PP
+   UNSPEC_MMA_XVF16GER2
+   UNSPEC_MMA_XVF16GER2NN
+   UNSPEC_MMA_XVF16GER2NP
+   UNSPEC_MMA_XVF16GER2PN
+   UNSPEC_MMA_XVF16GER2PP
+   UNSPEC_MMA_XVF32GER
+   UNSPEC_MMA_XVF32GERNN
+   UNSPEC_MMA_XVF32GERNP
+   UNSPEC_MMA_XVF32GERPN
+   UNSPEC_MMA_XVF32GERPP
+   UNSPEC_MMA_XVF64GER
+   UNSPEC_MMA_XVF64GERNN
+   UNSPEC_MMA_XVF64GERNP
+   UNSPEC_MMA_XVF64GERPN
+   UNSPEC_MMA_XVF64GERPP
+   UNSPEC_MMA_XVI16GER2
+   UNSPEC_MMA_XVI16GER2PP
+   UNSPEC_MMA_XVI16GER2S
+   UNSPEC_MMA_XVI16GER2SPP
+   UNSPEC_MMA_XVI4GER8
+   UNSPEC_MMA_XVI4GER8PP
+   UNSPEC_MMA_XVI8GER4
+   UNSPEC_MMA_XVI8GER4PP
+   UNSPEC_MMA_XVI8GER4SPP
+   UNSPEC_MMA_XXMFACC
+   UNSPEC_MMA_XXMTACC
+  ])
+
+;; MMA instructions with 1 accumulator argument
+(define_int_iterator MMA_ACC           [UNSPEC_MMA_XXMFACC
+                                        UNSPEC_MMA_XXMTACC])
+
+;; MMA instructions with 2 vector arguments
+(define_int_iterator MMA_VV            [UNSPEC_MMA_XVI4GER8
+                                        UNSPEC_MMA_XVI8GER4
+                                        UNSPEC_MMA_XVI16GER2
+                                        UNSPEC_MMA_XVI16GER2S
+                                        UNSPEC_MMA_XVF16GER2
+                                        UNSPEC_MMA_XVBF16GER2
+                                        UNSPEC_MMA_XVF32GER])
+
+;; MMA instructions with 1 accumulator and 2 vector arguments
+(define_int_iterator MMA_AVV           [UNSPEC_MMA_XVI4GER8PP
+                                        UNSPEC_MMA_XVI8GER4PP
+                                        UNSPEC_MMA_XVI8GER4SPP
+                                        UNSPEC_MMA_XVI16GER2PP
+                                        UNSPEC_MMA_XVI16GER2SPP
+                                        UNSPEC_MMA_XVF16GER2PP
+                                        UNSPEC_MMA_XVF16GER2PN
+                                        UNSPEC_MMA_XVF16GER2NP
+                                        UNSPEC_MMA_XVF16GER2NN
+                                        UNSPEC_MMA_XVBF16GER2PP
+                                        UNSPEC_MMA_XVBF16GER2PN
+                                        UNSPEC_MMA_XVBF16GER2NP
+                                        UNSPEC_MMA_XVBF16GER2NN
+                                        UNSPEC_MMA_XVF32GERPP
+                                        UNSPEC_MMA_XVF32GERPN
+                                        UNSPEC_MMA_XVF32GERNP
+                                        UNSPEC_MMA_XVF32GERNN])
+
+;; MMA instructions with 1 vector pair and 1 vector arguments
+(define_int_iterator MMA_PV            [UNSPEC_MMA_XVF64GER])
+
+;; MMA instructions with 1 accumulator, 1 vector pair and 1 vector arguments
+(define_int_iterator MMA_APV           [UNSPEC_MMA_XVF64GERPP
+                                        UNSPEC_MMA_XVF64GERPN
+                                        UNSPEC_MMA_XVF64GERNP
+                                        UNSPEC_MMA_XVF64GERNN])
+
+;; MMA instructions with 2 vector, 2 4-bit and 1 8-bit arguments
+(define_int_iterator MMA_VVI4I4I8      [UNSPEC_MMA_PMXVI4GER8])
+
+;; MMA instructions with 1 accumulator, 2 vector, 2 4-bit and 1 8-bit arguments
+(define_int_iterator MMA_AVVI4I4I8     [UNSPEC_MMA_PMXVI4GER8PP])
+
+;; MMA instructions with 2 vector, 2 4-bit and 1 2-bit arguments
+(define_int_iterator MMA_VVI4I4I2      [UNSPEC_MMA_PMXVI16GER2
+                                        UNSPEC_MMA_PMXVI16GER2S
+                                        UNSPEC_MMA_PMXVF16GER2
+                                        UNSPEC_MMA_PMXVBF16GER2])
+
+;; MMA instructions with 1 accumulator, 2 vector, 2 4-bit and 1 2-bit arguments
+(define_int_iterator MMA_AVVI4I4I2     [UNSPEC_MMA_PMXVI16GER2PP
+                                        UNSPEC_MMA_PMXVI16GER2SPP
+                                        UNSPEC_MMA_PMXVF16GER2PP
+                                        UNSPEC_MMA_PMXVF16GER2PN
+                                        UNSPEC_MMA_PMXVF16GER2NP
+                                        UNSPEC_MMA_PMXVF16GER2NN
+                                        UNSPEC_MMA_PMXVBF16GER2PP
+                                        UNSPEC_MMA_PMXVBF16GER2PN
+                                        UNSPEC_MMA_PMXVBF16GER2NP
+                                        UNSPEC_MMA_PMXVBF16GER2NN])
+
+;; MMA instructions with 2 vector and 2 4-bit arguments
+(define_int_iterator MMA_VVI4I4                [UNSPEC_MMA_PMXVF32GER])
+
+;; MMA instructions with 1 accumulator, 2 vector and 2 4-bit arguments
+(define_int_iterator MMA_AVVI4I4       [UNSPEC_MMA_PMXVF32GERPP
+                                        UNSPEC_MMA_PMXVF32GERPN
+                                        UNSPEC_MMA_PMXVF32GERNP
+                                        UNSPEC_MMA_PMXVF32GERNN])
+
+;; MMA instructions with 1 vector pair, 1 vector, 1 4-bit and 1 2-bit arguments
+(define_int_iterator MMA_PVI4I2                [UNSPEC_MMA_PMXVF64GER])
+
+;; MMA instructions with 1 accumulator, 1 vector pair, 1 vector, 1 4-bit and 1 2-bit arguments
+(define_int_iterator MMA_APVI4I2       [UNSPEC_MMA_PMXVF64GERPP
+                                        UNSPEC_MMA_PMXVF64GERPN
+                                        UNSPEC_MMA_PMXVF64GERNP
+                                        UNSPEC_MMA_PMXVF64GERNN])
+
+;; MMA instructions with 2 vector and 3 4-bit arguments
+(define_int_iterator MMA_VVI4I4I4      [UNSPEC_MMA_PMXVI8GER4])
+
+;; MMA instructions with 1 accumulator, 2 vector and 3 4-bit arguments
+(define_int_iterator MMA_AVVI4I4I4     [UNSPEC_MMA_PMXVI8GER4PP
+                                        UNSPEC_MMA_PMXVI8GER4SPP])
+
+(define_int_attr acc           [(UNSPEC_MMA_XXMFACC            "xxmfacc")
+                                (UNSPEC_MMA_XXMTACC            "xxmtacc")])
+
+(define_int_attr vv            [(UNSPEC_MMA_XVI4GER8           "xvi4ger8")
+                                (UNSPEC_MMA_XVI8GER4           "xvi8ger4")
+                                (UNSPEC_MMA_XVI16GER2          "xvi16ger2")
+                                (UNSPEC_MMA_XVI16GER2S         "xvi16ger2s")
+                                (UNSPEC_MMA_XVF16GER2          "xvf16ger2")
+                                (UNSPEC_MMA_XVBF16GER2         "xvbf16ger2")
+                                (UNSPEC_MMA_XVF32GER           "xvf32ger")])
+
+(define_int_attr avv           [(UNSPEC_MMA_XVI4GER8PP         "xvi4ger8pp")
+                                (UNSPEC_MMA_XVI8GER4PP         "xvi8ger4pp")
+                                (UNSPEC_MMA_XVI8GER4SPP        "xvi8ger4spp")
+                                (UNSPEC_MMA_XVI16GER2PP        "xvi16ger2pp")
+                                (UNSPEC_MMA_XVI16GER2SPP       "xvi16ger2spp")
+                                (UNSPEC_MMA_XVF16GER2PP        "xvf16ger2pp")
+                                (UNSPEC_MMA_XVF16GER2PN        "xvf16ger2pn")
+                                (UNSPEC_MMA_XVF16GER2NP        "xvf16ger2np")
+                                (UNSPEC_MMA_XVF16GER2NN        "xvf16ger2nn")
+                                (UNSPEC_MMA_XVBF16GER2PP       "xvbf16ger2pp")
+                                (UNSPEC_MMA_XVBF16GER2PN       "xvbf16ger2pn")
+                                (UNSPEC_MMA_XVBF16GER2NP       "xvbf16ger2np")
+                                (UNSPEC_MMA_XVBF16GER2NN       "xvbf16ger2nn")
+                                (UNSPEC_MMA_XVF32GERPP         "xvf32gerpp")
+                                (UNSPEC_MMA_XVF32GERPN         "xvf32gerpn")
+                                (UNSPEC_MMA_XVF32GERNP         "xvf32gernp")
+                                (UNSPEC_MMA_XVF32GERNN         "xvf32gernn")])
+
+(define_int_attr pv            [(UNSPEC_MMA_XVF64GER           "xvf64ger")])
+
+(define_int_attr apv           [(UNSPEC_MMA_XVF64GERPP         "xvf64gerpp")
+                                (UNSPEC_MMA_XVF64GERPN         "xvf64gerpn")
+                                (UNSPEC_MMA_XVF64GERNP         "xvf64gernp")
+                                (UNSPEC_MMA_XVF64GERNN         "xvf64gernn")])
+
+(define_int_attr vvi4i4i8      [(UNSPEC_MMA_PMXVI4GER8         "pmxvi4ger8")])
+
+(define_int_attr avvi4i4i8     [(UNSPEC_MMA_PMXVI4GER8PP       "pmxvi4ger8pp")])
+
+(define_int_attr vvi4i4i2      [(UNSPEC_MMA_PMXVI16GER2        "pmxvi16ger2")
+                                (UNSPEC_MMA_PMXVI16GER2S       "pmxvi16ger2s")
+                                (UNSPEC_MMA_PMXVF16GER2        "pmxvf16ger2")
+                                (UNSPEC_MMA_PMXVBF16GER2       "pmxvbf16ger2")])
+
+(define_int_attr avvi4i4i2     [(UNSPEC_MMA_PMXVI16GER2PP      "pmxvi16ger2pp")
+                                (UNSPEC_MMA_PMXVI16GER2SPP     "pmxvi16ger2spp")
+                                (UNSPEC_MMA_PMXVF16GER2PP      "pmxvf16ger2pp")
+                                (UNSPEC_MMA_PMXVF16GER2PN      "pmxvf16ger2pn")
+                                (UNSPEC_MMA_PMXVF16GER2NP      "pmxvf16ger2np")
+                                (UNSPEC_MMA_PMXVF16GER2NN      "pmxvf16ger2nn")
+                                (UNSPEC_MMA_PMXVBF16GER2PP     "pmxvbf16ger2pp")
+                                (UNSPEC_MMA_PMXVBF16GER2PN     "pmxvbf16ger2pn")
+                                (UNSPEC_MMA_PMXVBF16GER2NP     "pmxvbf16ger2np")
+                                (UNSPEC_MMA_PMXVBF16GER2NN     "pmxvbf16ger2nn")])
+
+(define_int_attr vvi4i4                [(UNSPEC_MMA_PMXVF32GER         "pmxvf32ger")])
+
+(define_int_attr avvi4i4       [(UNSPEC_MMA_PMXVF32GERPP       "pmxvf32gerpp")
+                                (UNSPEC_MMA_PMXVF32GERPN       "pmxvf32gerpn")
+                                (UNSPEC_MMA_PMXVF32GERNP       "pmxvf32gernp")
+                                (UNSPEC_MMA_PMXVF32GERNN       "pmxvf32gernn")])
+
+(define_int_attr pvi4i2                [(UNSPEC_MMA_PMXVF64GER         "pmxvf64ger")])
+
+(define_int_attr apvi4i2       [(UNSPEC_MMA_PMXVF64GERPP       "pmxvf64gerpp")
+                                (UNSPEC_MMA_PMXVF64GERPN       "pmxvf64gerpn")
+                                (UNSPEC_MMA_PMXVF64GERNP       "pmxvf64gernp")
+                                (UNSPEC_MMA_PMXVF64GERNN       "pmxvf64gernn")])
+
+(define_int_attr vvi4i4i4      [(UNSPEC_MMA_PMXVI8GER4         "pmxvi8ger4")])
+
+(define_int_attr avvi4i4i4     [(UNSPEC_MMA_PMXVI8GER4PP       "pmxvi8ger4pp")
+                                (UNSPEC_MMA_PMXVI8GER4SPP      "pmxvi8ger4spp")])
+
+
 ;; Define a disabled OImode move pattern, so we can use POImode.
 (define_expand "movoi"
   [(set (match_operand:OI 0 "nonimmediate_operand")
 })
 
 (define_insn_and_split "*movpxi"
-  [(set (match_operand:PXI 0 "nonimmediate_operand" "=d,m,d")
-       (match_operand:PXI 1 "input_operand" "m,d,d"))]
+  [(set (match_operand:PXI 0 "nonimmediate_operand" "=d,m,d,d")
+       (match_operand:PXI 1 "input_operand" "m,d,d,O"))]
   "TARGET_MMA
-   && (gpc_reg_operand (operands[0], PXImode)
+   && ((gpc_reg_operand (operands[0], PXImode)
+       && !(CONST_INT_P (operands[1]) && INTVAL (operands[1]) == 0))
        || gpc_reg_operand (operands[1], PXImode))"
   "#"
   "&& reload_completed"
   rs6000_split_multireg_move (operands[0], operands[1]);
   DONE;
 }
-  [(set_attr "type" "vecload,vecstore,veclogical")
-   (set_attr "length" "8,8,16")
-   (set_attr "max_prefixed_insns" "2,2,*")])
+  [(set_attr "type" "vecload,vecstore,veclogical,mma")
+   (set_attr "length" "8,8,16,*")
+   (set_attr "max_prefixed_insns" "2,2,*,*")])
+
+(define_expand "mma_assemble_pair"
+  [(match_operand:POI 0 "vsx_register_operand")
+   (match_operand:V16QI 1 "input_operand")
+   (match_operand:V16QI 2 "input_operand")]
+  "TARGET_MMA"
+{
+  rtx dst;
+
+  /* Let the compiler know the code below fully defines our output value.  */
+  emit_clobber (operands[0]);
+
+  dst = simplify_gen_subreg (V16QImode, operands[0], POImode, 0);
+  emit_move_insn (dst, operands[1]);
+  dst = simplify_gen_subreg (V16QImode, operands[0], POImode, 16);
+  emit_move_insn (dst, operands[2]);
+  DONE;
+})
+
+(define_expand "mma_assemble_acc"
+  [(match_operand:PXI 0 "fpr_reg_operand")
+   (match_operand:V16QI 1 "input_operand")
+   (match_operand:V16QI 2 "input_operand")
+   (match_operand:V16QI 3 "input_operand")
+   (match_operand:V16QI 4 "input_operand")]
+  "TARGET_MMA"
+{
+  rtx src = gen_rtx_UNSPEC (PXImode,
+                           gen_rtvec (4, operands[1], operands[2],
+                                      operands[3], operands[4]),
+                           UNSPEC_MMA_ASSEMBLE_ACC);
+  emit_move_insn (operands[0], src);
+  DONE;
+})
+
+(define_insn_and_split "*mma_assemble_acc"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=d")
+       (unspec:PXI [(match_operand:V16QI 1 "mma_assemble_input_operand" "mwa")
+                    (match_operand:V16QI 2 "mma_assemble_input_operand" "mwa")
+                    (match_operand:V16QI 3 "mma_assemble_input_operand" "mwa")
+                    (match_operand:V16QI 4 "mma_assemble_input_operand" "mwa")]
+                    UNSPEC_MMA_ASSEMBLE_ACC))]
+  "TARGET_MMA
+   && fpr_reg_operand (operands[0], PXImode)"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx src = gen_rtx_UNSPEC (PXImode,
+                           gen_rtvec (4, operands[1], operands[2],
+                                      operands[3], operands[4]),
+                           UNSPEC_MMA_ASSEMBLE_ACC);
+  rs6000_split_multireg_move (operands[0], src);
+  DONE;
+})
+
+;; MMA instructions that do not use their accumulators as an input, still
+;; must not allow their vector operands to overlap the registers used by
+;; the accumulator.  We enforce this by marking the output as early clobber.
+
+(define_insn "mma_<acc>"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d")
+       (unspec:PXI [(match_operand:PXI 1 "fpr_reg_operand" "0")]
+                   MMA_ACC))]
+  "TARGET_MMA"
+  "<acc> %A0"
+  [(set_attr "type" "mma")])
+
+(define_insn "mma_xxsetaccz"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=d")
+       (const_int 0))]
+  "TARGET_MMA"
+  "xxsetaccz %A0"
+  [(set_attr "type" "mma")])
+
+(define_insn "mma_<vv>"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d")
+       (unspec:PXI [(match_operand:V16QI 1 "vsx_register_operand" "wa")
+                    (match_operand:V16QI 2 "vsx_register_operand" "wa")]
+                    MMA_VV))]
+  "TARGET_MMA"
+  "<vv> %A0,%x1,%x2"
+  [(set_attr "type" "mma")])
+
+(define_insn "mma_<avv>"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d")
+       (unspec:PXI [(match_operand:PXI 1 "fpr_reg_operand" "0")
+                    (match_operand:V16QI 2 "vsx_register_operand" "wa")
+                    (match_operand:V16QI 3 "vsx_register_operand" "wa")]
+                    MMA_AVV))]
+  "TARGET_MMA"
+  "<avv> %A0,%x2,%x3"
+  [(set_attr "type" "mma")])
+
+(define_insn "mma_<pv>"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d")
+       (unspec:PXI [(match_operand:POI 1 "vsx_register_operand" "wa")
+                    (match_operand:V16QI 2 "vsx_register_operand" "wa")]
+                    MMA_PV))]
+  "TARGET_MMA"
+  "<pv> %A0,%x1,%x2"
+  [(set_attr "type" "mma")])
+
+(define_insn "mma_<apv>"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d")
+       (unspec:PXI [(match_operand:PXI 1 "fpr_reg_operand" "0")
+                    (match_operand:POI 2 "vsx_register_operand" "wa")
+                    (match_operand:V16QI 3 "vsx_register_operand" "wa")]
+                    MMA_APV))]
+  "TARGET_MMA"
+  "<apv> %A0,%x2,%x3"
+  [(set_attr "type" "mma")])
+
+(define_insn "mma_<vvi4i4i8>"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d")
+       (unspec:PXI [(match_operand:V16QI 1 "vsx_register_operand" "wa")
+                    (match_operand:V16QI 2 "vsx_register_operand" "wa")
+                    (match_operand:SI 3 "const_0_to_15_operand" "n")
+                    (match_operand:SI 4 "const_0_to_15_operand" "n")
+                    (match_operand:SI 5 "u8bit_cint_operand" "n")]
+                    MMA_VVI4I4I8))]
+  "TARGET_MMA"
+  "<vvi4i4i8> %A0,%x1,%x2,%3,%4,%5"
+  [(set_attr "type" "mma")
+   (set_attr "length" "8")])
+
+(define_insn "mma_<avvi4i4i8>"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d")
+       (unspec:PXI [(match_operand:PXI 1 "fpr_reg_operand" "0")
+                    (match_operand:V16QI 2 "vsx_register_operand" "wa")
+                    (match_operand:V16QI 3 "vsx_register_operand" "wa")
+                    (match_operand:SI 4 "const_0_to_15_operand" "n")
+                    (match_operand:SI 5 "const_0_to_15_operand" "n")
+                    (match_operand:SI 6 "u8bit_cint_operand" "n")]
+                    MMA_AVVI4I4I8))]
+  "TARGET_MMA"
+  "<avvi4i4i8> %A0,%x2,%x3,%4,%5,%6"
+  [(set_attr "type" "mma")
+   (set_attr "length" "8")])
+
+(define_insn "mma_<vvi4i4i2>"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d")
+       (unspec:PXI [(match_operand:V16QI 1 "vsx_register_operand" "wa")
+                    (match_operand:V16QI 2 "vsx_register_operand" "wa")
+                    (match_operand:SI 3 "const_0_to_15_operand" "n")
+                    (match_operand:SI 4 "const_0_to_15_operand" "n")
+                    (match_operand:SI 5 "const_0_to_3_operand" "n")]
+                    MMA_VVI4I4I2))]
+  "TARGET_MMA"
+  "<vvi4i4i2> %A0,%x1,%x2,%3,%4,%5"
+  [(set_attr "type" "mma")
+   (set_attr "length" "8")])
+
+(define_insn "mma_<avvi4i4i2>"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d")
+       (unspec:PXI [(match_operand:PXI 1 "fpr_reg_operand" "0")
+                    (match_operand:V16QI 2 "vsx_register_operand" "wa")
+                    (match_operand:V16QI 3 "vsx_register_operand" "wa")
+                    (match_operand:SI 4 "const_0_to_15_operand" "n")
+                    (match_operand:SI 5 "const_0_to_15_operand" "n")
+                    (match_operand:SI 6 "const_0_to_3_operand" "n")]
+                    MMA_AVVI4I4I2))]
+  "TARGET_MMA"
+  "<avvi4i4i2> %A0,%x2,%x3,%4,%5,%6"
+  [(set_attr "type" "mma")
+   (set_attr "length" "8")])
+
+(define_insn "mma_<vvi4i4>"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d")
+       (unspec:PXI [(match_operand:V16QI 1 "vsx_register_operand" "wa")
+                    (match_operand:V16QI 2 "vsx_register_operand" "wa")
+                    (match_operand:SI 3 "const_0_to_15_operand" "n")
+                    (match_operand:SI 4 "const_0_to_15_operand" "n")]
+                    MMA_VVI4I4))]
+  "TARGET_MMA"
+  "<vvi4i4> %A0,%x1,%x2,%3,%4"
+  [(set_attr "type" "mma")
+   (set_attr "length" "8")])
+
+(define_insn "mma_<avvi4i4>"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d")
+       (unspec:PXI [(match_operand:PXI 1 "fpr_reg_operand" "0")
+                    (match_operand:V16QI 2 "vsx_register_operand" "wa")
+                    (match_operand:V16QI 3 "vsx_register_operand" "wa")
+                    (match_operand:SI 4 "const_0_to_15_operand" "n")
+                    (match_operand:SI 5 "const_0_to_15_operand" "n")]
+                    MMA_AVVI4I4))]
+  "TARGET_MMA"
+  "<avvi4i4> %A0,%x2,%x3,%4,%5"
+  [(set_attr "type" "mma")
+   (set_attr "length" "8")])
+
+(define_insn "mma_<pvi4i2>"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d")
+       (unspec:PXI [(match_operand:POI 1 "vsx_register_operand" "wa")
+                    (match_operand:V16QI 2 "vsx_register_operand" "wa")
+                    (match_operand:SI 3 "const_0_to_15_operand" "n")
+                    (match_operand:SI 4 "const_0_to_3_operand" "n")]
+                    MMA_PVI4I2))]
+  "TARGET_MMA"
+  "<pvi4i2> %A0,%x1,%x2,%3,%4"
+  [(set_attr "type" "mma")
+   (set_attr "length" "8")])
+
+(define_insn "mma_<apvi4i2>"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d")
+       (unspec:PXI [(match_operand:PXI 1 "fpr_reg_operand" "0")
+                    (match_operand:POI 2 "vsx_register_operand" "wa")
+                    (match_operand:V16QI 3 "vsx_register_operand" "wa")
+                    (match_operand:SI 4 "const_0_to_15_operand" "n")
+                    (match_operand:SI 5 "const_0_to_3_operand" "n")]
+                    MMA_APVI4I2))]
+  "TARGET_MMA"
+  "<apvi4i2> %A0,%x2,%x3,%4,%5"
+  [(set_attr "type" "mma")
+   (set_attr "length" "8")])
+
+(define_insn "mma_<vvi4i4i4>"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d")
+       (unspec:PXI [(match_operand:V16QI 1 "vsx_register_operand" "wa")
+                    (match_operand:V16QI 2 "vsx_register_operand" "wa")
+                    (match_operand:SI 3 "const_0_to_15_operand" "n")
+                    (match_operand:SI 4 "const_0_to_15_operand" "n")
+                    (match_operand:SI 5 "const_0_to_15_operand" "n")]
+                    MMA_VVI4I4I4))]
+  "TARGET_MMA"
+  "<vvi4i4i4> %A0,%x1,%x2,%3,%4,%5"
+  [(set_attr "type" "mma")
+   (set_attr "length" "8")])
+
+(define_insn "mma_<avvi4i4i4>"
+  [(set (match_operand:PXI 0 "fpr_reg_operand" "=&d")
+       (unspec:PXI [(match_operand:PXI 1 "fpr_reg_operand" "0")
+                    (match_operand:V16QI 2 "vsx_register_operand" "wa")
+                    (match_operand:V16QI 3 "vsx_register_operand" "wa")
+                    (match_operand:SI 4 "const_0_to_15_operand" "n")
+                    (match_operand:SI 5 "const_0_to_15_operand" "n")
+                    (match_operand:SI 6 "const_0_to_15_operand" "n")]
+                    MMA_AVVI4I4I4))]
+  "TARGET_MMA"
+  "<avvi4i4i4> %A0,%x2,%x3,%4,%5,%6"
+  [(set_attr "type" "mma")
+   (set_attr "length" "8")])
index c3f460f..9762855 100644 (file)
   return gpc_reg_operand (op, mode);
 })
 
+;; Return 1 if this operand is valid for a MMA assemble accumulator insn.
+(define_special_predicate "mma_assemble_input_operand"
+  (match_test "(mode == V16QImode
+               && (vsx_register_operand (op, mode) || MEM_P (op)))"))
+
 ;; Return true if operand is an operator used in rotate-and-mask instructions.
 (define_predicate "rotate_mask_operator"
   (match_code "rotate,ashift,lshiftrt"))
index 8b1ddb0..968c46c 100644 (file)
@@ -32,6 +32,7 @@
    RS6000_BUILTIN_A -- ABS builtins
    RS6000_BUILTIN_D -- DST builtins
    RS6000_BUILTIN_H -- HTM builtins
+   RS6000_BUILTIN_M -- MMA builtins
    RS6000_BUILTIN_P -- Altivec, VSX, ISA 2.07 vector predicate builtins
    RS6000_BUILTIN_X -- special builtins
 
   #error "RS6000_BUILTIN_H is not defined."
 #endif
 
+#ifndef RS6000_BUILTIN_M
+  #error "RS6000_BUILTIN_M is not defined."
+#endif
+
 #ifndef RS6000_BUILTIN_P
   #error "RS6000_BUILTIN_P is not defined."
 #endif
                     | RS6000_BTC_SPECIAL),                             \
                    CODE_FOR_nothing)                   /* ICODE */
 
+/* MMA convenience macros.
+   BU_MMA_1, BU_MMA_3, BU_MMA_5 and BU_MMA_6 each define two consecutive
+   built-ins: the user-visible __builtin_mma_<NAME>, which is flagged
+   RS6000_BTC_VOID | RS6000_BTC_GIMPLE and has no insn code, immediately
+   followed by __builtin_mma_<NAME>_internal, which carries ICODE.
+   BU_MMA_V2 defines only the user-visible built-in and ignores ICODE.  */
+
+#define BU_MMA_1(ENUM, NAME, ATTR, ICODE)                              \
+  RS6000_BUILTIN_M (MMA_BUILTIN_ ## ENUM,              /* ENUM */      \
+                   "__builtin_mma_" NAME,              /* NAME */      \
+                   RS6000_BTM_MMA,                     /* MASK */      \
+                   (RS6000_BTC_ ## ATTR                /* ATTR */      \
+                    | RS6000_BTC_UNARY                                 \
+                    | RS6000_BTC_VOID                                  \
+                    | RS6000_BTC_GIMPLE),                              \
+                   CODE_FOR_nothing)                   /* ICODE */     \
+  RS6000_BUILTIN_M (MMA_BUILTIN_ ## ENUM ## _INTERNAL, /* ENUM */      \
+                   "__builtin_mma_" NAME "_internal",  /* NAME */      \
+                   RS6000_BTM_MMA,                     /* MASK */      \
+                   (RS6000_BTC_ ## ATTR                /* ATTR */      \
+                    | RS6000_BTC_UNARY),                               \
+                   CODE_FOR_ ## ICODE)                 /* ICODE */
+
+#define BU_MMA_V2(ENUM, NAME, ATTR, ICODE)                             \
+  RS6000_BUILTIN_M (MMA_BUILTIN_ ## ENUM,              /* ENUM */      \
+                   "__builtin_mma_" NAME,              /* NAME */      \
+                   RS6000_BTM_MMA,                     /* MASK */      \
+                   (RS6000_BTC_ ## ATTR                /* ATTR */      \
+                    | RS6000_BTC_BINARY                                \
+                    | RS6000_BTC_VOID                                  \
+                    | RS6000_BTC_GIMPLE),                              \
+                   CODE_FOR_nothing)                   /* ICODE */
+
+#define BU_MMA_3(ENUM, NAME, ATTR, ICODE)                              \
+  RS6000_BUILTIN_M (MMA_BUILTIN_ ## ENUM,              /* ENUM */      \
+                   "__builtin_mma_" NAME,              /* NAME */      \
+                   RS6000_BTM_MMA,                     /* MASK */      \
+                   (RS6000_BTC_ ## ATTR                /* ATTR */      \
+                    | RS6000_BTC_TERNARY                               \
+                    | RS6000_BTC_VOID                                  \
+                    | RS6000_BTC_GIMPLE),                              \
+                   CODE_FOR_nothing)                   /* ICODE */     \
+  RS6000_BUILTIN_M (MMA_BUILTIN_ ## ENUM ## _INTERNAL, /* ENUM */      \
+                   "__builtin_mma_" NAME "_internal",  /* NAME */      \
+                   RS6000_BTM_MMA,                     /* MASK */      \
+                   (RS6000_BTC_ ## ATTR                /* ATTR */      \
+                    | RS6000_BTC_TERNARY),                             \
+                   CODE_FOR_ ## ICODE)                 /* ICODE */
+
+#define BU_MMA_5(ENUM, NAME, ATTR, ICODE)                              \
+  RS6000_BUILTIN_M (MMA_BUILTIN_ ## ENUM,              /* ENUM */      \
+                   "__builtin_mma_" NAME,              /* NAME */      \
+                   RS6000_BTM_MMA,                     /* MASK */      \
+                   (RS6000_BTC_ ## ATTR                /* ATTR */      \
+                    | RS6000_BTC_QUINARY                               \
+                    | RS6000_BTC_VOID                                  \
+                    | RS6000_BTC_GIMPLE),                              \
+                   CODE_FOR_nothing)                   /* ICODE */     \
+  RS6000_BUILTIN_M (MMA_BUILTIN_ ## ENUM ## _INTERNAL, /* ENUM */      \
+                   "__builtin_mma_" NAME "_internal",  /* NAME */      \
+                   RS6000_BTM_MMA,                     /* MASK */      \
+                   (RS6000_BTC_ ## ATTR                /* ATTR */      \
+                    | RS6000_BTC_QUINARY),                             \
+                   CODE_FOR_ ## ICODE)                 /* ICODE */
+
+#define BU_MMA_6(ENUM, NAME, ATTR, ICODE)                              \
+  RS6000_BUILTIN_M (MMA_BUILTIN_ ## ENUM,              /* ENUM */      \
+                   "__builtin_mma_" NAME,              /* NAME */      \
+                   RS6000_BTM_MMA,                     /* MASK */      \
+                   (RS6000_BTC_ ## ATTR                /* ATTR */      \
+                    | RS6000_BTC_SENARY                                \
+                    | RS6000_BTC_VOID                                  \
+                    | RS6000_BTC_GIMPLE),                              \
+                   CODE_FOR_nothing)                   /* ICODE */     \
+  RS6000_BUILTIN_M (MMA_BUILTIN_ ## ENUM ## _INTERNAL, /* ENUM */      \
+                   "__builtin_mma_" NAME "_internal",  /* NAME */      \
+                   RS6000_BTM_MMA,                     /* MASK */      \
+                   (RS6000_BTC_ ## ATTR                /* ATTR */      \
+                    | RS6000_BTC_SENARY),                              \
+                   CODE_FOR_ ## ICODE)                 /* ICODE */
+
 /* ISA 2.05 (power6) convenience macros. */
 /* For functions that depend on the CMPB instruction */
 #define BU_P6_2(ENUM, NAME, ATTR, ICODE)                               \
@@ -2785,3 +2866,77 @@ BU_SPECIAL_X (RS6000_BUILTIN_CPU_SUPPORTS, "__builtin_cpu_supports",
 /* Darwin CfString builtin.  */
 BU_SPECIAL_X (RS6000_BUILTIN_CFSTRING, "__builtin_cfstring", RS6000_BTM_ALWAYS,
              RS6000_BTC_MISC)
+
+/* FUTURE MMA builtins.
+   NOTE(review): rs6000_gimple_fold_mma_builtin recognizes the disassemble
+   built-ins by testing whether the entry at fncode + 1 has CODE_FOR_nothing,
+   so the position of the BU_MMA_V2 entries relative to their neighbours
+   looks significant -- confirm before reordering or inserting entries.  */
+BU_VSX_1 (XVCVBF16SP,      "xvcvbf16sp",       MISC, vsx_xvcvbf16sp)
+BU_VSX_1 (XVCVSPBF16,      "xvcvspbf16",       MISC, vsx_xvcvspbf16)
+
+BU_MMA_1 (XXMFACC,         "xxmfacc",          QUAD, mma_xxmfacc)
+BU_MMA_1 (XXMTACC,         "xxmtacc",          QUAD, mma_xxmtacc)
+BU_MMA_1 (XXSETACCZ,       "xxsetaccz",        MISC, mma_xxsetaccz)
+
+BU_MMA_V2 (DISASSEMBLE_ACC, "disassemble_acc",  QUAD, nothing)
+BU_MMA_V2 (DISASSEMBLE_PAIR,"disassemble_pair", PAIR, nothing)
+
+BU_MMA_3 (ASSEMBLE_PAIR,    "assemble_pair",   MISC, mma_assemble_pair)
+BU_MMA_3 (XVBF16GER2,      "xvbf16ger2",       MISC, mma_xvbf16ger2)
+BU_MMA_3 (XVF16GER2,       "xvf16ger2",        MISC, mma_xvf16ger2)
+BU_MMA_3 (XVF32GER,        "xvf32ger",         MISC, mma_xvf32ger)
+BU_MMA_3 (XVF64GER,        "xvf64ger",         PAIR, mma_xvf64ger)
+BU_MMA_3 (XVI4GER8,        "xvi4ger8",         MISC, mma_xvi4ger8)
+BU_MMA_3 (XVI8GER4,        "xvi8ger4",         MISC, mma_xvi8ger4)
+BU_MMA_3 (XVI16GER2,       "xvi16ger2",        MISC, mma_xvi16ger2)
+BU_MMA_3 (XVI16GER2S,      "xvi16ger2s",       MISC, mma_xvi16ger2s)
+BU_MMA_3 (XVBF16GER2NN,            "xvbf16ger2nn",     QUAD, mma_xvbf16ger2nn)
+BU_MMA_3 (XVBF16GER2NP,            "xvbf16ger2np",     QUAD, mma_xvbf16ger2np)
+BU_MMA_3 (XVBF16GER2PN,            "xvbf16ger2pn",     QUAD, mma_xvbf16ger2pn)
+BU_MMA_3 (XVBF16GER2PP,            "xvbf16ger2pp",     QUAD, mma_xvbf16ger2pp)
+BU_MMA_3 (XVF16GER2NN,     "xvf16ger2nn",      QUAD, mma_xvf16ger2nn)
+BU_MMA_3 (XVF16GER2NP,     "xvf16ger2np",      QUAD, mma_xvf16ger2np)
+BU_MMA_3 (XVF16GER2PN,     "xvf16ger2pn",      QUAD, mma_xvf16ger2pn)
+BU_MMA_3 (XVF16GER2PP,     "xvf16ger2pp",      QUAD, mma_xvf16ger2pp)
+BU_MMA_3 (XVF32GERNN,      "xvf32gernn",       QUAD, mma_xvf32gernn)
+BU_MMA_3 (XVF32GERNP,      "xvf32gernp",       QUAD, mma_xvf32gernp)
+BU_MMA_3 (XVF32GERPN,      "xvf32gerpn",       QUAD, mma_xvf32gerpn)
+BU_MMA_3 (XVF32GERPP,      "xvf32gerpp",       QUAD, mma_xvf32gerpp)
+BU_MMA_3 (XVF64GERNN,      "xvf64gernn",       QUADPAIR, mma_xvf64gernn)
+BU_MMA_3 (XVF64GERNP,      "xvf64gernp",       QUADPAIR, mma_xvf64gernp)
+BU_MMA_3 (XVF64GERPN,      "xvf64gerpn",       QUADPAIR, mma_xvf64gerpn)
+BU_MMA_3 (XVF64GERPP,      "xvf64gerpp",       QUADPAIR, mma_xvf64gerpp)
+BU_MMA_3 (XVI4GER8PP,      "xvi4ger8pp",       QUAD, mma_xvi4ger8pp)
+BU_MMA_3 (XVI8GER4PP,      "xvi8ger4pp",       QUAD, mma_xvi8ger4pp)
+BU_MMA_3 (XVI8GER4SPP,     "xvi8ger4spp",      QUAD, mma_xvi8ger4spp)
+BU_MMA_3 (XVI16GER2PP,     "xvi16ger2pp",      QUAD, mma_xvi16ger2pp)
+BU_MMA_3 (XVI16GER2SPP,            "xvi16ger2spp",     QUAD, mma_xvi16ger2spp)
+
+BU_MMA_5 (ASSEMBLE_ACC,     "assemble_acc",    MISC, mma_assemble_acc)
+BU_MMA_5 (PMXVF32GER,      "pmxvf32ger",       MISC, mma_pmxvf32ger)
+BU_MMA_5 (PMXVF64GER,      "pmxvf64ger",       PAIR, mma_pmxvf64ger)
+BU_MMA_5 (PMXVF32GERNN,            "pmxvf32gernn",     QUAD, mma_pmxvf32gernn)
+BU_MMA_5 (PMXVF32GERNP,            "pmxvf32gernp",     QUAD, mma_pmxvf32gernp)
+BU_MMA_5 (PMXVF32GERPN,            "pmxvf32gerpn",     QUAD, mma_pmxvf32gerpn)
+BU_MMA_5 (PMXVF32GERPP,            "pmxvf32gerpp",     QUAD, mma_pmxvf32gerpp)
+BU_MMA_5 (PMXVF64GERNN,            "pmxvf64gernn",     QUADPAIR, mma_pmxvf64gernn)
+BU_MMA_5 (PMXVF64GERNP,            "pmxvf64gernp",     QUADPAIR, mma_pmxvf64gernp)
+BU_MMA_5 (PMXVF64GERPN,            "pmxvf64gerpn",     QUADPAIR, mma_pmxvf64gerpn)
+BU_MMA_5 (PMXVF64GERPP,            "pmxvf64gerpp",     QUADPAIR, mma_pmxvf64gerpp)
+
+BU_MMA_6 (PMXVBF16GER2,            "pmxvbf16ger2",     MISC, mma_pmxvbf16ger2)
+BU_MMA_6 (PMXVF16GER2,     "pmxvf16ger2",      MISC, mma_pmxvf16ger2)
+BU_MMA_6 (PMXVI4GER8,      "pmxvi4ger8",       MISC, mma_pmxvi4ger8)
+BU_MMA_6 (PMXVI8GER4,      "pmxvi8ger4",       MISC, mma_pmxvi8ger4)
+BU_MMA_6 (PMXVI16GER2,     "pmxvi16ger2",      MISC, mma_pmxvi16ger2)
+BU_MMA_6 (PMXVI16GER2S,            "pmxvi16ger2s",     MISC, mma_pmxvi16ger2s)
+BU_MMA_6 (PMXVBF16GER2NN,   "pmxvbf16ger2nn",   QUAD, mma_pmxvbf16ger2nn)
+BU_MMA_6 (PMXVBF16GER2NP,   "pmxvbf16ger2np",   QUAD, mma_pmxvbf16ger2np)
+BU_MMA_6 (PMXVBF16GER2PN,   "pmxvbf16ger2pn",   QUAD, mma_pmxvbf16ger2pn)
+BU_MMA_6 (PMXVBF16GER2PP,   "pmxvbf16ger2pp",   QUAD, mma_pmxvbf16ger2pp)
+BU_MMA_6 (PMXVF16GER2NN,    "pmxvf16ger2nn",    QUAD, mma_pmxvf16ger2nn)
+BU_MMA_6 (PMXVF16GER2NP,    "pmxvf16ger2np",    QUAD, mma_pmxvf16ger2np)
+BU_MMA_6 (PMXVF16GER2PN,    "pmxvf16ger2pn",    QUAD, mma_pmxvf16ger2pn)
+BU_MMA_6 (PMXVF16GER2PP,    "pmxvf16ger2pp",    QUAD, mma_pmxvf16ger2pp)
+BU_MMA_6 (PMXVI4GER8PP,            "pmxvi4ger8pp",     QUAD, mma_pmxvi4ger8pp)
+BU_MMA_6 (PMXVI8GER4PP,            "pmxvi8ger4pp",     QUAD, mma_pmxvi8ger4pp)
+BU_MMA_6 (PMXVI8GER4SPP,    "pmxvi8ger4spp",   QUAD, mma_pmxvi8ger4spp)
+BU_MMA_6 (PMXVI16GER2PP,    "pmxvi16ger2pp",    QUAD, mma_pmxvi16ger2pp)
+BU_MMA_6 (PMXVI16GER2SPP,   "pmxvi16ger2spp",   QUAD, mma_pmxvi16ger2spp)
index 088264b..fdf136f 100644 (file)
@@ -183,6 +183,7 @@ static tree builtin_function_type (machine_mode, machine_mode,
                                   enum rs6000_builtins, const char *name);
 static void rs6000_common_init_builtins (void);
 static void htm_init_builtins (void);
+static void mma_init_builtins (void);
 
 
 /* Hash table to keep track of the argument types for builtin functions.  */
@@ -243,6 +244,7 @@ builtin_hasher::equal (builtin_hash_struct *p1, builtin_hash_struct *p2)
 #undef RS6000_BUILTIN_A
 #undef RS6000_BUILTIN_D
 #undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_M
 #undef RS6000_BUILTIN_P
 #undef RS6000_BUILTIN_X
 
@@ -270,6 +272,9 @@ builtin_hasher::equal (builtin_hash_struct *p1, builtin_hash_struct *p2)
 #define RS6000_BUILTIN_H(ENUM, NAME, MASK, ATTR, ICODE)  \
   { NAME, ICODE, MASK, ATTR },
 
+#define RS6000_BUILTIN_M(ENUM, NAME, MASK, ATTR, ICODE)  \
+  { NAME, ICODE, MASK, ATTR },
+
 #define RS6000_BUILTIN_P(ENUM, NAME, MASK, ATTR, ICODE)  \
   { NAME, ICODE, MASK, ATTR },
 
@@ -296,6 +301,7 @@ static const struct rs6000_builtin_info_type rs6000_builtin_info[] =
 #undef RS6000_BUILTIN_A
 #undef RS6000_BUILTIN_D
 #undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_M
 #undef RS6000_BUILTIN_P
 #undef RS6000_BUILTIN_X
 
@@ -8354,6 +8360,9 @@ def_builtin (const char *name, tree type, enum rs6000_builtins code)
          attr_string = ", fp, const";
        }
     }
+  else if ((classify & (RS6000_BTC_QUAD | RS6000_BTC_PAIR)) != 0)
+    /* The function uses a register quad and/or pair.  Nothing to do.  */
+    ;
   else if ((classify & RS6000_BTC_ATTR_MASK) != 0)
     gcc_unreachable ();
 
@@ -8372,6 +8381,7 @@ def_builtin (const char *name, tree type, enum rs6000_builtins code)
 #undef RS6000_BUILTIN_A
 #undef RS6000_BUILTIN_D
 #undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_M
 #undef RS6000_BUILTIN_P
 #undef RS6000_BUILTIN_X
 
@@ -8385,6 +8395,7 @@ def_builtin (const char *name, tree type, enum rs6000_builtins code)
 #define RS6000_BUILTIN_A(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_D(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_H(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_M(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_P(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_X(ENUM, NAME, MASK, ATTR, ICODE)
 
@@ -8403,6 +8414,7 @@ static const struct builtin_description bdesc_3arg[] =
 #undef RS6000_BUILTIN_A
 #undef RS6000_BUILTIN_D
 #undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_M
 #undef RS6000_BUILTIN_P
 #undef RS6000_BUILTIN_X
 
@@ -8416,6 +8428,7 @@ static const struct builtin_description bdesc_3arg[] =
 #define RS6000_BUILTIN_A(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_D(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_H(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_M(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_P(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_X(ENUM, NAME, MASK, ATTR, ICODE)
 
@@ -8434,6 +8447,7 @@ static const struct builtin_description bdesc_4arg[] =
 #undef RS6000_BUILTIN_A
 #undef RS6000_BUILTIN_D
 #undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_M
 #undef RS6000_BUILTIN_P
 #undef RS6000_BUILTIN_X
 
@@ -8447,6 +8461,7 @@ static const struct builtin_description bdesc_4arg[] =
   { MASK, ICODE, NAME, ENUM },
 
 #define RS6000_BUILTIN_H(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_M(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_P(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_X(ENUM, NAME, MASK, ATTR, ICODE)
 
@@ -8465,6 +8480,7 @@ static const struct builtin_description bdesc_dst[] =
 #undef RS6000_BUILTIN_A
 #undef RS6000_BUILTIN_D
 #undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_M
 #undef RS6000_BUILTIN_P
 #undef RS6000_BUILTIN_X
 
@@ -8478,6 +8494,7 @@ static const struct builtin_description bdesc_dst[] =
 #define RS6000_BUILTIN_A(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_D(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_H(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_M(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_P(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_X(ENUM, NAME, MASK, ATTR, ICODE)
 
@@ -8494,6 +8511,7 @@ static const struct builtin_description bdesc_2arg[] =
 #undef RS6000_BUILTIN_A
 #undef RS6000_BUILTIN_D
 #undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_M
 #undef RS6000_BUILTIN_P
 #undef RS6000_BUILTIN_X
 
@@ -8505,6 +8523,7 @@ static const struct builtin_description bdesc_2arg[] =
 #define RS6000_BUILTIN_A(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_D(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_H(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_M(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_P(ENUM, NAME, MASK, ATTR, ICODE) \
   { MASK, ICODE, NAME, ENUM },
 
@@ -8527,6 +8546,7 @@ static const struct builtin_description bdesc_altivec_preds[] =
 #undef RS6000_BUILTIN_A
 #undef RS6000_BUILTIN_D
 #undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_M
 #undef RS6000_BUILTIN_P
 #undef RS6000_BUILTIN_X
 
@@ -8540,6 +8560,7 @@ static const struct builtin_description bdesc_altivec_preds[] =
 
 #define RS6000_BUILTIN_D(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_H(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_M(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_P(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_X(ENUM, NAME, MASK, ATTR, ICODE)
 
@@ -8559,6 +8580,7 @@ static const struct builtin_description bdesc_abs[] =
 #undef RS6000_BUILTIN_A
 #undef RS6000_BUILTIN_D
 #undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_M
 #undef RS6000_BUILTIN_P
 #undef RS6000_BUILTIN_X
 
@@ -8572,6 +8594,7 @@ static const struct builtin_description bdesc_abs[] =
 #define RS6000_BUILTIN_A(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_D(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_H(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_M(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_P(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_X(ENUM, NAME, MASK, ATTR, ICODE)
 
@@ -8590,6 +8613,7 @@ static const struct builtin_description bdesc_1arg[] =
 #undef RS6000_BUILTIN_A
 #undef RS6000_BUILTIN_D
 #undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_M
 #undef RS6000_BUILTIN_P
 #undef RS6000_BUILTIN_X
 
@@ -8603,6 +8627,7 @@ static const struct builtin_description bdesc_1arg[] =
 #define RS6000_BUILTIN_A(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_D(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_H(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_M(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_P(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_X(ENUM, NAME, MASK, ATTR, ICODE)
 
@@ -8620,6 +8645,7 @@ static const struct builtin_description bdesc_0arg[] =
 #undef RS6000_BUILTIN_A
 #undef RS6000_BUILTIN_D
 #undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_M
 #undef RS6000_BUILTIN_P
 #undef RS6000_BUILTIN_X
 
@@ -8633,6 +8659,7 @@ static const struct builtin_description bdesc_0arg[] =
 #define RS6000_BUILTIN_H(ENUM, NAME, MASK, ATTR, ICODE) \
   { MASK, ICODE, NAME, ENUM },
 
+#define RS6000_BUILTIN_M(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_P(ENUM, NAME, MASK, ATTR, ICODE)
 #define RS6000_BUILTIN_X(ENUM, NAME, MASK, ATTR, ICODE)
 
@@ -8641,6 +8668,7 @@ static const struct builtin_description bdesc_htm[] =
 #include "rs6000-builtin.def"
 };
 
+/* MMA builtins.  */
 #undef RS6000_BUILTIN_0
 #undef RS6000_BUILTIN_1
 #undef RS6000_BUILTIN_2
@@ -8649,7 +8677,40 @@ static const struct builtin_description bdesc_htm[] =
 #undef RS6000_BUILTIN_A
 #undef RS6000_BUILTIN_D
 #undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_M
 #undef RS6000_BUILTIN_P
+#undef RS6000_BUILTIN_X
+
+#define RS6000_BUILTIN_0(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_1(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_2(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_3(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_4(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_A(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_D(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_H(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_M(ENUM, NAME, MASK, ATTR, ICODE) \
+  { MASK, ICODE, NAME, ENUM },
+
+#define RS6000_BUILTIN_P(ENUM, NAME, MASK, ATTR, ICODE)
+#define RS6000_BUILTIN_X(ENUM, NAME, MASK, ATTR, ICODE)
+/* One descriptor per RS6000_BUILTIN_M entry in rs6000-builtin.def.  Every
+   other RS6000_BUILTIN_* class is defined empty just above, so only the
+   MMA built-ins land in this table.  */
+static const struct builtin_description bdesc_mma[] =
+{
+#include "rs6000-builtin.def"
+};
+
+#undef RS6000_BUILTIN_0
+#undef RS6000_BUILTIN_1
+#undef RS6000_BUILTIN_2
+#undef RS6000_BUILTIN_3
+#undef RS6000_BUILTIN_4
+#undef RS6000_BUILTIN_A
+#undef RS6000_BUILTIN_D
+#undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_M
+#undef RS6000_BUILTIN_P
+#undef RS6000_BUILTIN_X
 
 /* Return true if a builtin function is overloaded.  */
 bool
@@ -9393,6 +9454,133 @@ altivec_expand_stv_builtin (enum insn_code icode, tree exp)
   return NULL_RTX;
 }
 
+/* Expand the MMA built-in in EXP.
+   Store true in *EXPANDEDP if we found a built-in to expand.
+   EXP is the CALL_EXPR being expanded and TARGET a suggested place for
+   the result; if TARGET is null, has the wrong mode or fails the insn's
+   output predicate, a fresh register of the output mode is used instead.
+   If the called function is not listed in bdesc_mma, *EXPANDEDP is set to
+   false and NULL_RTX is returned.  Otherwise return the rtx holding the
+   result, const0_rtx for a void built-in or after an operand error has
+   been diagnosed, or NULL_RTX if the insn generator produced no pattern.  */
+
+static rtx
+mma_expand_builtin (tree exp, rtx target, bool *expandedp)
+{
+  unsigned i;
+  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
+  enum rs6000_builtins fcode
+    = (enum rs6000_builtins) DECL_MD_FUNCTION_CODE (fndecl);
+  const struct builtin_description *d = bdesc_mma;
+
+  /* Look up the descriptor for this function code in bdesc_mma.  */
+  for (i = 0; i < ARRAY_SIZE (bdesc_mma); i++, d++)
+    if (d->code == fcode)
+      break;
+
+  if (i >= ARRAY_SIZE (bdesc_mma))
+    {
+      *expandedp = false;
+      return NULL_RTX;
+    }
+
+  *expandedp = true;
+
+  tree arg;
+  call_expr_arg_iterator iter;
+  enum insn_code icode = d->icode;
+  const struct insn_operand_data *insn_op;
+  rtx op[MAX_MMA_OPERANDS];
+  unsigned nopnds = 0;
+  unsigned attr = rs6000_builtin_info[fcode].attr;
+  bool void_func = (attr & RS6000_BTC_VOID);
+  machine_mode tmode = VOIDmode;
+
+  if (TREE_TYPE (TREE_TYPE (fndecl)) != void_type_node)
+    {
+      tmode = insn_data[icode].operand[0].mode;
+      if (!target
+         || GET_MODE (target) != tmode
+         || !(*insn_data[icode].operand[0].predicate) (target, tmode))
+       target = gen_reg_rtx (tmode);
+      op[nopnds++] = target;  /* The result is insn operand 0.  */
+    }
+  else
+    target = const0_rtx;
+
+  FOR_EACH_CALL_EXPR_ARG (arg, iter, exp)
+    {
+      if (arg == error_mark_node)
+       return const0_rtx;
+
+      rtx opnd;
+      insn_op = &insn_data[icode].operand[nopnds];
+      if (TREE_CODE (arg) == ADDR_EXPR
+         && MEM_P (DECL_RTL (TREE_OPERAND (arg, 0))))
+       opnd = DECL_RTL (TREE_OPERAND (arg, 0));  /* Use the object's MEM.  */
+      else
+       opnd = expand_normal (arg);
+
+      if (!(*insn_op->predicate) (opnd, insn_op->mode))
+       {
+         if (!strcmp (insn_op->constraint, "n"))  /* Immediate operand.  */
+           {
+             if (!CONST_INT_P (opnd))
+               error ("argument %d must be an unsigned literal", nopnds);
+             else
+               error ("argument %d is an unsigned literal that is "
+                      "out of range", nopnds);
+             return const0_rtx;
+           }
+         opnd = copy_to_mode_reg (insn_op->mode, opnd);
+       }
+
+      /* Some MMA instructions have INOUT accumulator operands, so force
+        their target register to be the same as their input register.
+        Only the first argument (insn operand 1) with a matching "0"
+        constraint is considered.  */
+      if (!void_func
+         && nopnds == 1
+         && !strcmp (insn_op->constraint, "0")
+         && insn_op->mode == tmode
+         && REG_P (opnd)
+         && (*insn_data[icode].operand[0].predicate) (opnd, tmode))
+       target = op[0] = opnd;
+
+      op[nopnds++] = opnd;
+    }
+
+  unsigned attr_args = attr & RS6000_BTC_OPND_MASK;
+  if (attr & RS6000_BTC_QUAD)
+    attr_args++;  /* The accumulator is both the output and an input.  */
+
+  gcc_assert (nopnds == attr_args);
+
+  rtx pat;
+  switch (nopnds)
+    {
+    case 1:
+      pat = GEN_FCN (icode) (op[0]);
+      break;
+    case 2:
+      pat = GEN_FCN (icode) (op[0], op[1]);
+      break;
+    case 3:
+      pat = GEN_FCN (icode) (op[0], op[1], op[2]);
+      break;
+    case 4:
+      pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3]);
+      break;
+    case 5:
+      pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4]);
+      break;
+    case 6:
+      pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5]);
+      break;
+    case 7:
+      pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5], op[6]);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  if (!pat)
+    return NULL_RTX;
+  emit_insn (pat);
+
+  return target;
+}
+
 /* Return the appropriate SPR number associated with the given builtin.  */
 static inline HOST_WIDE_INT
 htm_spr_num (enum rs6000_builtins code)
@@ -9539,11 +9727,11 @@ htm_expand_builtin (tree exp, rtx target, bool * expandedp)
        if (flag_checking)
          {
            int expected_nopnds = 0;
-           if ((attr & RS6000_BTC_TYPE_MASK) == RS6000_BTC_UNARY)
+           if ((attr & RS6000_BTC_OPND_MASK) == RS6000_BTC_UNARY)
              expected_nopnds = 1;
-           else if ((attr & RS6000_BTC_TYPE_MASK) == RS6000_BTC_BINARY)
+           else if ((attr & RS6000_BTC_OPND_MASK) == RS6000_BTC_BINARY)
              expected_nopnds = 2;
-           else if ((attr & RS6000_BTC_TYPE_MASK) == RS6000_BTC_TERNARY)
+           else if ((attr & RS6000_BTC_OPND_MASK) == RS6000_BTC_TERNARY)
              expected_nopnds = 3;
            else if ((attr & RS6000_BTC_TYPE_MASK) == RS6000_BTC_QUATERNARY)
              expected_nopnds = 4;
@@ -10647,6 +10835,10 @@ rs6000_invalid_builtin (enum rs6000_builtins fncode)
           "-m64");
   else if ((fnmask & RS6000_BTM_P9_MISC) == RS6000_BTM_P9_MISC)
     error ("%qs requires the %qs option", name, "-mcpu=power9");
+  else if ((fnmask & RS6000_BTM_FUTURE) != 0)
+    error ("%qs requires the %qs option", name, "-mcpu=future");
+  else if ((fnmask & RS6000_BTM_MMA) != 0)
+    error ("%qs requires the %qs option", name, "-mmma");
   else if ((fnmask & RS6000_BTM_LDBL128) == RS6000_BTM_LDBL128)
     {
       if (!TARGET_HARD_FLOAT)
@@ -10690,6 +10882,10 @@ rs6000_fold_builtin (tree fndecl ATTRIBUTE_UNUSED,
 static bool
 rs6000_builtin_valid_without_lhs (enum rs6000_builtins fn_code)
 {
+  /* Check for built-ins explicitly marked as a void function.  */
+  if (rs6000_builtin_info[fn_code].attr & RS6000_BTC_VOID)
+    return true;
+
   switch (fn_code)
     {
     case ALTIVEC_BUILTIN_STVX_V16QI:
@@ -10836,6 +11032,156 @@ fold_mergeeo_helper (gimple_stmt_iterator *gsi, gimple *stmt, int use_odd)
   gsi_replace (gsi, g, true);
 }
 
+/* Expand the MMA built-ins early, so that we can convert the pass-by-reference
+   __vector_quad arguments into pass-by-value arguments, leading to more
+   efficient code generation.
+   GSI points at the call statement.  Return false, leaving the statement
+   untouched, if the called built-in is not flagged RS6000_BTC_GIMPLE;
+   otherwise replace the statement with the new sequence and return
+   true.  */
+
+bool
+rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi)
+{
+  gimple *stmt = gsi_stmt (*gsi);
+  tree fndecl = gimple_call_fndecl (stmt);
+  enum rs6000_builtins fncode
+    = (enum rs6000_builtins) DECL_MD_FUNCTION_CODE (fndecl);
+  unsigned attr = rs6000_builtin_info[fncode].attr;
+
+  if ((attr & RS6000_BTC_GIMPLE) == 0)
+    return false;
+
+  unsigned nopnds = (attr & RS6000_BTC_OPND_MASK);
+  gimple_seq new_seq = NULL;
+  gimple *new_call;
+  tree new_decl;
+
+  if (rs6000_builtin_info[fncode + 1].icode == CODE_FOR_nothing)
+    {
+      /* This is an MMA disassemble built-in function, the only kind that
+        is not immediately followed by an _internal built-in with a real
+        insn code.  */
+      gcc_assert (fncode == MMA_BUILTIN_DISASSEMBLE_ACC
+                 || fncode == MMA_BUILTIN_DISASSEMBLE_PAIR);
+
+      push_gimplify_context (true);
+      tree dst_ptr = gimple_call_arg (stmt, 0);
+      tree src_ptr = gimple_call_arg (stmt, 1);
+      tree src_type = TREE_TYPE (src_ptr);
+      tree src = make_ssa_name (TREE_TYPE (src_type));
+      gimplify_assign (src, build_simple_mem_ref (src_ptr), &new_seq);
+
+      /* If we are not disassembling an accumulator or our destination is
+        another accumulator, then just copy the entire thing as is.  */
+      if (fncode != MMA_BUILTIN_DISASSEMBLE_ACC
+         || TREE_TYPE (TREE_TYPE (dst_ptr)) == vector_quad_type_node)
+       {
+         tree dst = build_simple_mem_ref (build1 (VIEW_CONVERT_EXPR,
+                                                  src_type, dst_ptr));
+         gimplify_assign (dst, src, &new_seq);
+         pop_gimplify_context (NULL);
+         gsi_replace_with_seq (gsi, new_seq, true);
+         return true;
+       }
+
+      /* We're disassembling an accumulator into a different type, so we need
+        to emit an xxmfacc instruction now, since we cannot do it later.  */
+      new_decl = rs6000_builtin_decls[MMA_BUILTIN_XXMFACC_INTERNAL];
+      new_call = gimple_build_call (new_decl, 1, src);
+      src = make_ssa_name (vector_quad_type_node);
+      gimple_call_set_lhs (new_call, src);
+      gimple_seq_add_stmt (&new_seq, new_call);
+
+      /* Copy the accumulator vector by vector: four V16QI pieces stored
+        at byte offsets 0, 16, 32 and 48 from DST_PTR.  */
+      tree dst_type = build_pointer_type_for_mode (unsigned_V16QI_type_node,
+                                                  ptr_mode, true);
+      tree dst_base = build1 (VIEW_CONVERT_EXPR, dst_type, dst_ptr);
+      tree array_type = build_array_type_nelts (unsigned_V16QI_type_node, 4);
+      tree src_array = build1 (VIEW_CONVERT_EXPR, array_type, src);
+      for (unsigned i = 0; i < 4; i++)
+       {
+         tree ref = build4 (ARRAY_REF, unsigned_V16QI_type_node, src_array,
+                            build_int_cst (size_type_node, i),
+                            NULL_TREE, NULL_TREE);
+         tree dst = build2 (MEM_REF, unsigned_V16QI_type_node, dst_base,
+                            build_int_cst (dst_type, i * 16));
+         gimplify_assign (dst, ref, &new_seq);
+       }
+      pop_gimplify_context (NULL);
+      gsi_replace_with_seq (gsi, new_seq, true);
+      return true;
+    }
+
+  /* Convert this built-in into an internal version that uses pass-by-value
+     arguments.  The internal built-in follows immediately after this one.
+     Argument 0 of the original call is the pointer through which the
+     result is stored; MEM is the dereference of that pointer.  */
+  new_decl = rs6000_builtin_decls[fncode + 1];
+  tree lhs, mem, op[MAX_MMA_OPERANDS];
+  tree acc = gimple_call_arg (stmt, 0);
+  if (TREE_CODE (acc) == PARM_DECL)
+    mem = build1 (INDIRECT_REF, TREE_TYPE (TREE_TYPE (acc)), acc);
+  else
+    mem = build_simple_mem_ref (acc);
+  push_gimplify_context (true);
+
+  if ((attr & RS6000_BTC_QUAD) != 0)
+    {
+      /* This built-in has a pass-by-reference accumulator input, so load it
+        into a temporary accumulator for use as a pass-by-value input.  */
+      op[0] = make_ssa_name (vector_quad_type_node);
+      for (unsigned i = 1; i < nopnds; i++)
+       op[i] = gimple_call_arg (stmt, i);
+      gimplify_assign (op[0], mem, &new_seq);
+    }
+  else
+    {
+      /* This built-in does not use its pass-by-reference accumulator argument
+        as an input argument, so remove it from the input list.  */
+      nopnds--;
+      for (unsigned i = 0; i < nopnds; i++)
+       op[i] = gimple_call_arg (stmt, i + 1);
+    }
+
+  switch (nopnds)
+    {
+    case 0:
+      new_call = gimple_build_call (new_decl, 0);
+      break;
+    case 1:
+      new_call = gimple_build_call (new_decl, 1, op[0]);
+      break;
+    case 2:
+      new_call = gimple_build_call (new_decl, 2, op[0], op[1]);
+      break;
+    case 3:
+      new_call = gimple_build_call (new_decl, 3, op[0], op[1], op[2]);
+      break;
+    case 4:
+      new_call = gimple_build_call (new_decl, 4, op[0], op[1], op[2], op[3]);
+      break;
+    case 5:
+      new_call = gimple_build_call (new_decl, 5, op[0], op[1], op[2], op[3],
+                                   op[4]);
+      break;
+    case 6:
+      new_call = gimple_build_call (new_decl, 6, op[0], op[1], op[2], op[3],
+                                   op[4], op[5]);
+      break;
+    case 7:
+      new_call = gimple_build_call (new_decl, 7, op[0], op[1], op[2], op[3],
+                                   op[4], op[5], op[6]);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  if (fncode == MMA_BUILTIN_ASSEMBLE_PAIR)
+    lhs = make_ssa_name (vector_pair_type_node);
+  else
+    lhs = make_ssa_name (vector_quad_type_node);
+  gimple_call_set_lhs (new_call, lhs);
+  gimple_seq_add_stmt (&new_seq, new_call);
+  gimplify_assign (mem, lhs, &new_seq);  /* Store back through arg 0.  */
+  pop_gimplify_context (NULL);
+  gsi_replace_with_seq (gsi, new_seq, true);
+
+  return true;
+}
+
 /* Fold a machine-dependent built-in in GIMPLE.  (For folding into
    a constant, use rs6000_fold_builtin.)  */
 
@@ -10871,11 +11217,12 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
     return false;
 
   /* Don't fold invalid builtins, let rs6000_expand_builtin diagnose it.  */
-  HOST_WIDE_INT mask = rs6000_builtin_info[uns_fncode].mask;
-  bool func_valid_p = (rs6000_builtin_mask & mask) == mask;
-  if (!func_valid_p)
+  if (!rs6000_builtin_is_supported_p (fn_code))
     return false;
 
+  if (rs6000_gimple_fold_mma_builtin (gsi))
+    return true;
+
   switch (fn_code)
     {
     /* Flavors of vec_add.  We deliberately don't expand
@@ -12010,6 +12357,13 @@ rs6000_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
       break;
     }
 
+  if (TARGET_MMA)
+    {
+      ret = mma_expand_builtin (exp, target, &success);
+
+      if (success)
+       return ret;
+    }
   if (TARGET_ALTIVEC)
     {
       ret = altivec_expand_builtin (exp, target, &success);
@@ -12025,7 +12379,7 @@ rs6000_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
        return ret;
     }  
 
-  unsigned attr = rs6000_builtin_info[uns_fcode].attr & RS6000_BTC_TYPE_MASK;
+  unsigned attr = rs6000_builtin_info[uns_fcode].attr & RS6000_BTC_OPND_MASK;
   /* RS6000_BTC_SPECIAL represents no-operand operators.  */
   gcc_assert (attr == RS6000_BTC_UNARY
              || attr == RS6000_BTC_BINARY
@@ -12208,7 +12562,7 @@ rs6000_init_builtins (void)
   else
     ieee128_float_type_node = ibm128_float_type_node = long_double_type_node;
 
-  /* Vector paired and vector quad support.  */
+  /* Vector pair and vector quad support.  */
   if (TARGET_MMA)
     {
       tree oi_uns_type = make_unsigned_type (256);
@@ -12290,6 +12644,8 @@ rs6000_init_builtins (void)
      the target attribute.  */
   if (TARGET_EXTRA_BUILTINS)
     altivec_init_builtins ();
+  if (TARGET_MMA)
+    mma_init_builtins ();
   if (TARGET_HTM)
     htm_init_builtins ();
 
@@ -13016,6 +13372,119 @@ altivec_init_builtins (void)
 }
 
+/* Build a function type for each MMA built-in listed in bdesc_mma and
+   register it with def_builtin.  Entries without a name, and entries whose
+   mask is not fully enabled in rs6000_builtin_mask, are skipped.
+
+   For every entry the types are collected in op[] and handed to
+   build_function_type_list in order, op[0] first.  Built-ins flagged
+   RS6000_BTC_GIMPLE get void_type_node in op[0] and take their remaining
+   types from the insn of the table entry that follows them; for those,
+   PXImode operands (and POImode operands of ASSEMBLE_PAIR) are turned into
+   pointers to the vector quad / vector pair types.  */
 static void
+mma_init_builtins (void)
+{
+  const struct builtin_description *d = bdesc_mma;
+
+  for (unsigned i = 0; i < ARRAY_SIZE (bdesc_mma); i++, d++)
+    {
+      tree op[MAX_MMA_OPERANDS], type;
+      HOST_WIDE_INT mask = d->mask;
+      unsigned icode = (unsigned) d->icode;
+      unsigned attr = rs6000_builtin_info[d->code].attr;
+      int attr_args = (attr & RS6000_BTC_OPND_MASK);
+      bool gimple_func = (attr & RS6000_BTC_GIMPLE);
+      unsigned nopnds = 0;
+
+      /* Check for a missing name first: the "skip" message below prints
+        d->name with %s, which must not be handed a null pointer.  */
+      if (d->name == 0)
+       {
+         if (TARGET_DEBUG_BUILTIN)
+           fprintf (stderr, "mma_builtin, bdesc_mma[%u] no name\n", i);
+         continue;
+       }
+
+      if ((mask & rs6000_builtin_mask) != mask)
+       {
+         if (TARGET_DEBUG_BUILTIN)
+           fprintf (stderr, "mma_builtin, skip binary %s\n", d->name);
+         continue;
+       }
+
+      if (gimple_func)
+       {
+         gcc_assert (icode == CODE_FOR_nothing);
+         op[nopnds++] = void_type_node;
+         /* Some MMA built-ins that are expanded into gimple are converted
+            into internal MMA built-ins that are expanded into rtl.
+            The internal built-in follows immediately after this built-in,
+            so there must be a next entry to read the insn code from.  */
+         gcc_assert (i + 1 < ARRAY_SIZE (bdesc_mma));
+         icode = d[1].icode;
+       }
+      else
+       {
+         if ((attr & RS6000_BTC_QUAD) == 0)
+           attr_args--;
+
+         /* Ensure we have the correct number and type of operands.  */
+         gcc_assert (attr_args == insn_data[icode].n_operands - 1);
+       }
+
+      if (icode == CODE_FOR_nothing)
+       {
+         /* This is a disassemble MMA built-in function.  */
+         gcc_assert (attr_args == RS6000_BTC_BINARY
+                     && (d->code == MMA_BUILTIN_DISASSEMBLE_ACC
+                         || d->code == MMA_BUILTIN_DISASSEMBLE_PAIR));
+         op[nopnds++] = build_pointer_type (void_type_node);
+         if (attr & RS6000_BTC_QUAD)
+           op[nopnds++] = build_pointer_type (vector_quad_type_node);
+         else
+           op[nopnds++] = build_pointer_type (vector_pair_type_node);
+       }
+      else
+       {
+         /* This is a normal MMA built-in function.  Operand 0 of the insn
+            is skipped when RS6000_BTC_QUAD is set.  */
+         unsigned j = (attr & RS6000_BTC_QUAD) ? 1 : 0;
+         for (; j < insn_data[icode].n_operands; j++)
+           {
+             machine_mode mode = insn_data[icode].operand[j].mode;
+
+             /* op[] only has room for MAX_MMA_OPERANDS entries.  */
+             gcc_assert (nopnds < MAX_MMA_OPERANDS);
+
+             if (gimple_func && mode == PXImode)
+               op[nopnds++] = build_pointer_type (vector_quad_type_node);
+             else if (gimple_func && mode == POImode
+                      && d->code == MMA_BUILTIN_ASSEMBLE_PAIR)
+               op[nopnds++] = build_pointer_type (vector_pair_type_node);
+             else
+               /* MMA uses unsigned types.  */
+               op[nopnds++] = builtin_mode_to_type[mode][1];
+           }
+       }
+
+      /* build_function_type_list is variadic, so dispatch on the number of
+        collected types.  */
+      switch (nopnds)
+       {
+       case 1:
+         type = build_function_type_list (op[0], NULL_TREE);
+         break;
+       case 2:
+         type = build_function_type_list (op[0], op[1], NULL_TREE);
+         break;
+       case 3:
+         type = build_function_type_list (op[0], op[1], op[2], NULL_TREE);
+         break;
+       case 4:
+         type = build_function_type_list (op[0], op[1], op[2], op[3],
+                                          NULL_TREE);
+         break;
+       case 5:
+         type = build_function_type_list (op[0], op[1], op[2], op[3], op[4],
+                                          NULL_TREE);
+         break;
+       case 6:
+         type = build_function_type_list (op[0], op[1], op[2], op[3], op[4],
+                                          op[5], NULL_TREE);
+         break;
+       case 7:
+         type = build_function_type_list (op[0], op[1], op[2], op[3], op[4],
+                                          op[5], op[6], NULL_TREE);
+         break;
+       default:
+         gcc_unreachable ();
+       }
+
+      def_builtin (d->name, type, d->code);
+    }
+}
+
+static void
 htm_init_builtins (void)
 {
   HOST_WIDE_INT builtin_mask = rs6000_builtin_mask;
@@ -13029,7 +13498,7 @@ htm_init_builtins (void)
       HOST_WIDE_INT mask = d->mask;
       unsigned attr = rs6000_builtin_info[d->code].attr;
       bool void_func = (attr & RS6000_BTC_VOID);
-      int attr_args = (attr & RS6000_BTC_TYPE_MASK);
+      int attr_args = (attr & RS6000_BTC_OPND_MASK);
       int nopnds = 0;
       tree gpr_type_node;
       tree rettype;
@@ -13195,6 +13664,8 @@ builtin_function_type (machine_mode mode_ret, machine_mode mode_arg0,
     case P8V_BUILTIN_VGBBD:
     case MISC_BUILTIN_CDTBCD:
     case MISC_BUILTIN_CBCDTD:
+    case VSX_BUILTIN_XVCVSPBF16:
+    case VSX_BUILTIN_XVCVBF16SP:
       h.uns_p[0] = 1;
       h.uns_p[1] = 1;
       break;
index 8f06080..6bc070f 100644 (file)
@@ -9941,7 +9941,7 @@ rs6000_emit_move (rtx dest, rtx source, machine_mode mode)
 
     case E_POImode:
     case E_PXImode:
-      if (CONSTANT_P (operands[1]))
+      if (CONST_INT_P (operands[1]) && INTVAL (operands[1]) != 0)
        error ("%qs is an opaque type, and you can't set it to other values.",
               (mode == POImode) ? "__vector_pair" : "__vector_quad");
       break;
@@ -12853,6 +12853,14 @@ print_operand (FILE *file, rtx x, int code)
       /* %c is output_addr_const if a CONSTANT_ADDRESS_P, otherwise
         output_operand.  */
 
+    case 'A':
+      /* Write the MMA accumulator number associated with VSX register X.  */
+      if (!REG_P (x) || !FP_REGNO_P (REGNO (x)) || (REGNO (x) % 4) != 0)
+       output_operand_lossage ("invalid %%A value");
+      else
+       fprintf (file, "%d", (REGNO (x) - FIRST_FPR_REGNO) / 4);
+      return;
+
     case 'D':
       /* Like 'J' but get to the GT bit only.  */
       if (!REG_P (x) || !CR_REGNO_P (REGNO (x)))
@@ -15963,6 +15971,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
          unsigned offset = 0;
          unsigned size = GET_MODE_SIZE (reg_mode);
 
+         /* If we are reading an accumulator register, we have to
+            deprime it before we can access it.  */
+         if (TARGET_MMA
+             && GET_MODE (src) == PXImode && FP_REGNO_P (REGNO (src)))
+           emit_insn (gen_mma_xxmfacc (src, src));
+
          for (int i = 0; i < nregs; i++)
            {
              unsigned subreg = (WORDS_BIG_ENDIAN)
@@ -15991,6 +16005,32 @@ rs6000_split_multireg_move (rtx dst, rtx src)
              emit_insn (gen_rtx_SET (dst2, src2));
            }
 
+         /* If we are writing an accumulator register, we have to
+            prime it after we've written it.  */
+         if (TARGET_MMA
+             && GET_MODE (dst) == PXImode && FP_REGNO_P (REGNO (dst)))
+           emit_insn (gen_mma_xxmtacc (dst, dst));
+
+         return;
+       }
+
+      if (GET_CODE (src) == UNSPEC)
+       {
+         gcc_assert (REG_P (dst)
+                     && FP_REGNO_P (REGNO (dst))
+                     && XINT (src, 1) == UNSPEC_MMA_ASSEMBLE_ACC);
+
+         reg_mode = GET_MODE (XVECEXP (src, 0, 0));
+         for (int i = 0; i < XVECLEN (src, 0); i++)
+           {
+             rtx dst_i = gen_rtx_REG (reg_mode, reg + i);
+             emit_insn (gen_rtx_SET (dst_i, XVECEXP (src, 0, i)));
+           }
+
+         /* We are writing an accumulator register, so we have to
+            prime it after we've written it.  */
+         emit_insn (gen_mma_xxmtacc (dst, dst));
+
          return;
        }
 
@@ -15999,6 +16039,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
 
   if (REG_P (src) && REG_P (dst) && (REGNO (src) < REGNO (dst)))
     {
+      /* If we are reading an accumulator register, we have to
+        deprime it before we can access it.  */
+      if (TARGET_MMA
+         && GET_MODE (src) == PXImode && FP_REGNO_P (REGNO (src)))
+       emit_insn (gen_mma_xxmfacc (src, src));
+
       /* Move register range backwards, if we might have destructive
         overlap.  */
       int i;
@@ -16007,6 +16053,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
                                                     i * reg_mode_size),
                                simplify_gen_subreg (reg_mode, src, mode,
                                                     i * reg_mode_size)));
+
+      /* If we are writing an accumulator register, we have to
+        prime it after we've written it.  */
+      if (TARGET_MMA
+         && GET_MODE (dst) == PXImode && FP_REGNO_P (REGNO (dst)))
+       emit_insn (gen_mma_xxmtacc (dst, dst));
     }
   else
     {
@@ -16139,6 +16191,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
            gcc_assert (rs6000_offsettable_memref_p (dst, reg_mode, true));
        }
 
+      /* If we are reading an accumulator register, we have to
+        deprime it before we can access it.  */
+      if (TARGET_MMA && REG_P (src)
+         && GET_MODE (src) == PXImode && FP_REGNO_P (REGNO (src)))
+       emit_insn (gen_mma_xxmfacc (src, src));
+
       for (i = 0; i < nregs; i++)
        {
          /* Calculate index to next subword.  */
@@ -16156,6 +16214,13 @@ rs6000_split_multireg_move (rtx dst, rtx src)
                                  simplify_gen_subreg (reg_mode, src, mode,
                                                       j * reg_mode_size)));
        }
+
+      /* If we are writing an accumulator register, we have to
+        prime it after we've written it.  */
+      if (TARGET_MMA && REG_P (dst)
+         && GET_MODE (dst) == PXImode && FP_REGNO_P (REGNO (dst)))
+       emit_insn (gen_mma_xxmtacc (dst, dst));
+
       if (restore_basereg != NULL_RTX)
        emit_insn (restore_basereg);
     }
index 188f49c..eb22d5e 100644 (file)
@@ -2251,20 +2251,24 @@ extern int frame_pointer_needed;
    flags macros, but we've run out of bits, so we now map the options into new
    settings used here.  */
 
-/* Builtin attributes.  */
-#define RS6000_BTC_SPECIAL     0x00000000      /* Special function.  */
+/* Builtin operand count.  */
 #define RS6000_BTC_UNARY       0x00000001      /* normal unary function.  */
 #define RS6000_BTC_BINARY      0x00000002      /* normal binary function.  */
 #define RS6000_BTC_TERNARY     0x00000003      /* normal ternary function.  */
 #define RS6000_BTC_QUATERNARY  0x00000004      /* normal quaternary
                                                   function. */
+#define RS6000_BTC_QUINARY     0x00000005      /* normal quinary function.  */
+#define RS6000_BTC_SENARY      0x00000006      /* normal senary function.  */
+#define RS6000_BTC_OPND_MASK   0x00000007      /* Mask to isolate operands. */
 
-#define RS6000_BTC_PREDICATE   0x00000005      /* predicate function.  */
-#define RS6000_BTC_ABS         0x00000006      /* Altivec/VSX ABS
+/* Builtin attributes.  */
+#define RS6000_BTC_SPECIAL     0x00000000      /* Special function.  */
+#define RS6000_BTC_PREDICATE   0x00000008      /* predicate function.  */
+#define RS6000_BTC_ABS         0x00000010      /* Altivec/VSX ABS
                                                   function.  */
-#define RS6000_BTC_DST         0x00000007      /* Altivec DST function.  */
+#define RS6000_BTC_DST         0x00000020      /* Altivec DST function.  */
 
-#define RS6000_BTC_TYPE_MASK   0x0000000f      /* Mask to isolate types */
+#define RS6000_BTC_TYPE_MASK   0x0000003f      /* Mask to isolate types */
 
 #define RS6000_BTC_MISC                0x00000000      /* No special attributes.  */
 #define RS6000_BTC_CONST       0x00000100      /* Neither uses, nor
@@ -2273,13 +2277,18 @@ extern int frame_pointer_needed;
                                                   state/mem and does
                                                   not modify global state.  */
 #define RS6000_BTC_FP          0x00000400      /* depends on rounding mode.  */
-#define RS6000_BTC_ATTR_MASK   0x00000700      /* Mask of the attributes.  */
+#define RS6000_BTC_QUAD                0x00000800      /* Uses a register quad.  */
+#define RS6000_BTC_PAIR                0x00001000      /* Uses a register pair.  */
+#define RS6000_BTC_QUADPAIR    0x00001800      /* Uses a quad and a pair.  */
+#define RS6000_BTC_ATTR_MASK   0x00001f00      /* Mask of the attributes.  */
 
 /* Miscellaneous information.  */
 #define RS6000_BTC_SPR         0x01000000      /* function references SPRs.  */
 #define RS6000_BTC_VOID                0x02000000      /* function has no return value.  */
 #define RS6000_BTC_CR          0x04000000      /* function references a CR.  */
 #define RS6000_BTC_OVERLOADED  0x08000000      /* function is overloaded.  */
+#define RS6000_BTC_GIMPLE      0x10000000      /* function should be expanded
+                                                  into gimple.  */
 #define RS6000_BTC_MISC_MASK   0x1f000000      /* Mask of the misc info.  */
 
 /* Convenience macros to document the instruction type.  */
@@ -2348,6 +2357,7 @@ extern int frame_pointer_needed;
 #undef RS6000_BUILTIN_A
 #undef RS6000_BUILTIN_D
 #undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_M
 #undef RS6000_BUILTIN_P
 #undef RS6000_BUILTIN_X
 
@@ -2359,6 +2369,7 @@ extern int frame_pointer_needed;
 #define RS6000_BUILTIN_A(ENUM, NAME, MASK, ATTR, ICODE) ENUM,
 #define RS6000_BUILTIN_D(ENUM, NAME, MASK, ATTR, ICODE) ENUM,
 #define RS6000_BUILTIN_H(ENUM, NAME, MASK, ATTR, ICODE) ENUM,
+#define RS6000_BUILTIN_M(ENUM, NAME, MASK, ATTR, ICODE) ENUM,
 #define RS6000_BUILTIN_P(ENUM, NAME, MASK, ATTR, ICODE) ENUM,
 #define RS6000_BUILTIN_X(ENUM, NAME, MASK, ATTR, ICODE) ENUM,
 
@@ -2377,6 +2388,7 @@ enum rs6000_builtins
 #undef RS6000_BUILTIN_A
 #undef RS6000_BUILTIN_D
 #undef RS6000_BUILTIN_H
+#undef RS6000_BUILTIN_M
 #undef RS6000_BUILTIN_P
 #undef RS6000_BUILTIN_X
 
index f577bb0..67d7f38 100644 (file)
    vecsimple,veccomplex,vecdiv,veccmp,veccmpsimple,vecperm,
    vecfloat,vecfdiv,vecdouble,mffgpr,mftgpr,crypto,
    veclogical,veccmpfx,vecexts,vecmove,
-   htm,htmsimple,dfp"
+   htm,htmsimple,dfp,mma"
   (const_string "integer"))
 
 ;; What data size does this instruction work on?
index 2a28215..342927a 100644 (file)
    UNSPEC_VSX_DIVUD
    UNSPEC_VSX_MULSD
    UNSPEC_VSX_SIGN_EXTEND
+   UNSPEC_VSX_XVCVBF16SP
+   UNSPEC_VSX_XVCVSPBF16
    UNSPEC_VSX_XVCVSPSXDS
    UNSPEC_VSX_VSLO
    UNSPEC_VSX_EXTRACT
    UNSPEC_XXGENPCV
   ])
 
+(define_int_iterator XVCVBF16  [UNSPEC_VSX_XVCVSPBF16  ; Iterated over by
+                                UNSPEC_VSX_XVCVBF16SP])  ; "vsx_<xvcvbf16>".
+
+(define_int_attr xvcvbf16       [(UNSPEC_VSX_XVCVSPBF16 "xvcvspbf16")  ; unspec ->
+                                (UNSPEC_VSX_XVCVBF16SP "xvcvbf16sp")])  ; mnemonic.
+
 ;; VSX moves
 
 ;; The patterns for LE permuted loads and stores come before the general
   DONE;
 })
 
+(define_insn "vsx_<xvcvbf16>"  ; Name comes from the xvcvbf16 attribute.
+  [(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
+       (unspec:V16QI [(match_operand:V16QI 1 "vsx_register_operand" "wa")]
+                     XVCVBF16))]  ; One V16QI input, one V16QI result.
+  "TARGET_FUTURE"
+  "<xvcvbf16> %x0,%x1"  ; Mnemonic is the attribute string for the unspec.
+  [(set_attr "type" "vecfloat")])
index 10dc32e..95f7192 100644 (file)
@@ -13858,6 +13858,7 @@ instructions, but allow the compiler to schedule those calls.
 * PowerPC AltiVec/VSX Built-in Functions::
 * PowerPC Hardware Transactional Memory Built-in Functions::
 * PowerPC Atomic Memory Operation Functions::
+* PowerPC Matrix-Multiply Assist Built-in Functions::
 * RX Built-in Functions::
 * S/390 System z Built-in Functions::
 * SH Built-in Functions::
@@ -21359,6 +21360,100 @@ void amo_stdat_smax (int64_t *, int64_t);
 void amo_stdat_smin (int64_t *, int64_t);
 @end smallexample
 
+@node PowerPC Matrix-Multiply Assist Built-in Functions
+@subsection PowerPC Matrix-Multiply Assist Built-in Functions
+ISA 3.1 of the PowerPC added new Matrix-Multiply Assist (MMA) instructions.
+GCC provides support for these instructions through the following built-in
+functions which are enabled with the @code{-mmma} option.  The @code{vec_t}
+type below is defined to be a normal vector unsigned char type.  The
+@code{uint2}, @code{uint4} and @code{uint8} parameters are 2-bit, 4-bit and
+8-bit unsigned integer constants respectively.  The compiler will verify that
+they are constants and that their values are within range.
+
+The built-in functions supported are:
+
+@smallexample
+void __builtin_mma_xvi4ger8 (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvi8ger4 (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvi16ger2 (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvi16ger2s (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvf16ger2 (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvbf16ger2 (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvf32ger (__vector_quad *, vec_t, vec_t);
+
+void __builtin_mma_xvi4ger8pp (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvi8ger4pp (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvi8ger4spp (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvi16ger2pp (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvi16ger2spp (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvf16ger2pp (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvf16ger2pn (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvf16ger2np (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvf16ger2nn (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvbf16ger2pp (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvbf16ger2pn (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvbf16ger2np (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvbf16ger2nn (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvf32gerpp (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvf32gerpn (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvf32gernp (__vector_quad *, vec_t, vec_t);
+void __builtin_mma_xvf32gernn (__vector_quad *, vec_t, vec_t);
+
+void __builtin_mma_pmxvi4ger8 (__vector_quad *, vec_t, vec_t, uint4, uint4, uint8);
+void __builtin_mma_pmxvi4ger8pp (__vector_quad *, vec_t, vec_t, uint4, uint4, uint8);
+
+void __builtin_mma_pmxvi8ger4 (__vector_quad *, vec_t, vec_t, uint4, uint4, uint4);
+void __builtin_mma_pmxvi8ger4pp (__vector_quad *, vec_t, vec_t, uint4, uint4, uint4);
+void __builtin_mma_pmxvi8ger4spp (__vector_quad *, vec_t, vec_t, uint4, uint4, uint4);
+
+void __builtin_mma_pmxvi16ger2 (__vector_quad *, vec_t, vec_t, uint4, uint4, uint2);
+void __builtin_mma_pmxvi16ger2s (__vector_quad *, vec_t, vec_t, uint4, uint4, uint2);
+void __builtin_mma_pmxvf16ger2 (__vector_quad *, vec_t, vec_t, uint4, uint4, uint2);
+void __builtin_mma_pmxvbf16ger2 (__vector_quad *, vec_t, vec_t, uint4, uint4, uint2);
+
+void __builtin_mma_pmxvi16ger2pp (__vector_quad *, vec_t, vec_t, uint4, uint4, uint2);
+void __builtin_mma_pmxvi16ger2spp (__vector_quad *, vec_t, vec_t, uint4, uint4, uint2);
+void __builtin_mma_pmxvf16ger2pp (__vector_quad *, vec_t, vec_t, uint4, uint4, uint2);
+void __builtin_mma_pmxvf16ger2pn (__vector_quad *, vec_t, vec_t, uint4, uint4, uint2);
+void __builtin_mma_pmxvf16ger2np (__vector_quad *, vec_t, vec_t, uint4, uint4, uint2);
+void __builtin_mma_pmxvf16ger2nn (__vector_quad *, vec_t, vec_t, uint4, uint4, uint2);
+void __builtin_mma_pmxvbf16ger2pp (__vector_quad *, vec_t, vec_t, uint4, uint4, uint2);
+void __builtin_mma_pmxvbf16ger2pn (__vector_quad *, vec_t, vec_t, uint4, uint4, uint2);
+void __builtin_mma_pmxvbf16ger2np (__vector_quad *, vec_t, vec_t, uint4, uint4, uint2);
+void __builtin_mma_pmxvbf16ger2nn (__vector_quad *, vec_t, vec_t, uint4, uint4, uint2);
+
+void __builtin_mma_pmxvf32ger (__vector_quad *, vec_t, vec_t, uint4, uint4);
+void __builtin_mma_pmxvf32gerpp (__vector_quad *, vec_t, vec_t, uint4, uint4);
+void __builtin_mma_pmxvf32gerpn (__vector_quad *, vec_t, vec_t, uint4, uint4);
+void __builtin_mma_pmxvf32gernp (__vector_quad *, vec_t, vec_t, uint4, uint4);
+void __builtin_mma_pmxvf32gernn (__vector_quad *, vec_t, vec_t, uint4, uint4);
+
+void __builtin_mma_xvf64ger (__vector_quad *, __vector_pair, vec_t);
+void __builtin_mma_xvf64gerpp (__vector_quad *, __vector_pair, vec_t);
+void __builtin_mma_xvf64gerpn (__vector_quad *, __vector_pair, vec_t);
+void __builtin_mma_xvf64gernp (__vector_quad *, __vector_pair, vec_t);
+void __builtin_mma_xvf64gernn (__vector_quad *, __vector_pair, vec_t);
+
+void __builtin_mma_pmxvf64ger (__vector_quad *, __vector_pair, vec_t, uint4, uint2);
+void __builtin_mma_pmxvf64gerpp (__vector_quad *, __vector_pair, vec_t, uint4, uint2);
+void __builtin_mma_pmxvf64gerpn (__vector_quad *, __vector_pair, vec_t, uint4, uint2);
+void __builtin_mma_pmxvf64gernp (__vector_quad *, __vector_pair, vec_t, uint4, uint2);
+void __builtin_mma_pmxvf64gernn (__vector_quad *, __vector_pair, vec_t, uint4, uint2);
+
+void __builtin_mma_xxmtacc (__vector_quad *);
+void __builtin_mma_xxmfacc (__vector_quad *);
+void __builtin_mma_xxsetaccz (__vector_quad *);
+
+void __builtin_mma_assemble_acc (__vector_quad *, vec_t, vec_t, vec_t, vec_t);
+void __builtin_mma_disassemble_acc (void *, __vector_quad *);
+
+void __builtin_mma_assemble_pair (__vector_pair *, vec_t, vec_t);
+void __builtin_mma_disassemble_pair (void *, __vector_pair *);
+
+vec_t __builtin_vsx_xvcvspbf16 (vec_t);
+vec_t __builtin_vsx_xvcvbf16sp (vec_t);
+@end smallexample
+
 @node RX Built-in Functions
 @subsection RX Built-in Functions
 GCC supports some of the RX instructions which cannot be expressed in
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
new file mode 100644 (file)
index 0000000..a971c86
--- /dev/null
@@ -0,0 +1,313 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_future_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=future -O2" } */
+
+typedef unsigned char  vec_t __attribute__((vector_size(16)));
+
+void
+foo0 (__vector_quad *dst, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  __builtin_mma_xvi4ger8 (&acc, vec0, vec1);
+  __builtin_mma_xvi4ger8pp (&acc, vec0, vec1);
+  dst[0] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo1 (__vector_quad *dst, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  __builtin_mma_xvi8ger4 (&acc, vec0, vec1);
+  __builtin_mma_xvi8ger4pp (&acc, vec0, vec1);
+  __builtin_mma_xvi8ger4spp(&acc, vec0, vec1);
+  dst[1] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo2 (__vector_quad *dst, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  __builtin_mma_xvi16ger2 (&acc, vec0, vec1);
+  __builtin_mma_xvi16ger2pp (&acc, vec0, vec1);
+  dst[2] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo3 (__vector_quad *dst, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  __builtin_mma_xvi16ger2s (&acc, vec0, vec1);
+  __builtin_mma_xvi16ger2spp (&acc, vec0, vec1);
+  dst[3] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo4 (__vector_quad *dst, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  __builtin_mma_xvf16ger2 (&acc, vec0, vec1);
+  __builtin_mma_xvf16ger2pp (&acc, vec0, vec1);
+  __builtin_mma_xvf16ger2pn (&acc, vec0, vec1);
+  dst[4] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo4b (__vector_quad *dst, __vector_quad *src, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  acc = src[0];  /* Seed ACC from the caller before the first call.  */
+  __builtin_mma_xvf16ger2np (&acc, vec0, vec1);
+  __builtin_mma_xvf16ger2nn (&acc, vec0, vec1);
+  dst[4] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo5 (__vector_quad *dst, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  __builtin_mma_xvbf16ger2 (&acc, vec0, vec1);
+  __builtin_mma_xvbf16ger2pp (&acc, vec0, vec1);
+  __builtin_mma_xvbf16ger2pn (&acc, vec0, vec1);
+  dst[5] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo5b (__vector_quad *dst, __vector_quad *src, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  acc = src[0];  /* Seed ACC from the caller before the first call.  */
+  __builtin_mma_xvbf16ger2np (&acc, vec0, vec1);
+  __builtin_mma_xvbf16ger2nn (&acc, vec0, vec1);
+  dst[5] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo6 (__vector_quad *dst, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  __builtin_mma_xvf32ger (&acc, vec0, vec1);
+  __builtin_mma_xvf32gerpp (&acc, vec0, vec1);
+  __builtin_mma_xvf32gerpn (&acc, vec0, vec1);
+  dst[6] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo6b (__vector_quad *dst, __vector_quad *src, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  acc = src[0];  /* Seed ACC from the caller before the first call.  */
+  __builtin_mma_xvf32gernp (&acc, vec0, vec1);
+  __builtin_mma_xvf32gernn (&acc, vec0, vec1);
+  dst[6] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo7 (__vector_quad *dst, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  __builtin_mma_pmxvi4ger8 (&acc, vec0, vec1, 15, 15, 255);
+  __builtin_mma_pmxvi4ger8pp (&acc, vec0, vec1, 15, 15, 255);
+  dst[7] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo8 (__vector_quad *dst, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  __builtin_mma_pmxvi8ger4 (&acc, vec0, vec1, 15, 15, 15);
+  __builtin_mma_pmxvi8ger4pp (&acc, vec0, vec1, 15, 15, 15);
+  __builtin_mma_pmxvi8ger4spp(&acc, vec0, vec1, 15, 15, 15);
+  dst[8] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo9 (__vector_quad *dst, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  __builtin_mma_pmxvi16ger2 (&acc, vec0, vec1, 15, 15, 3);
+  __builtin_mma_pmxvi16ger2pp (&acc, vec0, vec1, 15, 15, 3);
+  dst[9] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo10 (__vector_quad *dst, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  __builtin_mma_pmxvi16ger2s (&acc, vec0, vec1, 15, 15, 3);
+  __builtin_mma_pmxvi16ger2spp (&acc, vec0, vec1, 15, 15, 3);
+  dst[10] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo11 (__vector_quad *dst, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  __builtin_mma_pmxvf16ger2 (&acc, vec0, vec1, 15, 15, 3);
+  __builtin_mma_pmxvf16ger2pp (&acc, vec0, vec1, 15, 15, 3);
+  __builtin_mma_pmxvf16ger2pn (&acc, vec0, vec1, 15, 15, 3);
+  dst[11] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo11b (__vector_quad *dst, __vector_quad *src, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  acc = src[0];  /* Seed ACC from the caller before the first call.  */
+  __builtin_mma_pmxvf16ger2np (&acc, vec0, vec1, 15, 15, 3);
+  __builtin_mma_pmxvf16ger2nn (&acc, vec0, vec1, 15, 15, 3);
+  dst[11] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo12 (__vector_quad *dst, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  __builtin_mma_pmxvbf16ger2 (&acc, vec0, vec1, 15, 15, 3);
+  __builtin_mma_pmxvbf16ger2pp (&acc, vec0, vec1, 15, 15, 3);
+  __builtin_mma_pmxvbf16ger2pn (&acc, vec0, vec1, 15, 15, 3);
+  dst[12] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo12b (__vector_quad *dst, __vector_quad *src, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  acc = src[0];  /* Seed ACC from the caller before the first call.  */
+  __builtin_mma_pmxvbf16ger2np (&acc, vec0, vec1, 15, 15, 3);
+  __builtin_mma_pmxvbf16ger2nn (&acc, vec0, vec1, 15, 15, 3);
+  dst[12] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo13 (__vector_quad *dst, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  __builtin_mma_pmxvf32ger (&acc, vec0, vec1, 15, 15);
+  __builtin_mma_pmxvf32gerpp (&acc, vec0, vec1, 15, 15);
+  __builtin_mma_pmxvf32gerpn (&acc, vec0, vec1, 15, 15);
+  dst[13] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+void
+foo13b (__vector_quad *dst, __vector_quad *src, vec_t *vec)
+{
+  __vector_quad acc;  /* Passed by address to every built-in call below.  */
+  vec_t vec0 = vec[0];
+  vec_t vec1 = vec[1];
+
+  acc = src[0];  /* Seed ACC from the caller before the first call.  */
+  __builtin_mma_pmxvf32gernp (&acc, vec0, vec1, 15, 15);
+  __builtin_mma_pmxvf32gernn (&acc, vec0, vec1, 15, 15);
+  dst[13] = acc;  /* Hand the final ACC value back to the caller.  */
+}
+
+/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */
+/* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */
+/* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */
+/* { dg-final { scan-assembler-times {\mxvbf16ger2\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvbf16ger2nn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvbf16ger2np\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvbf16ger2pn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvbf16ger2pp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvf16ger2\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvf16ger2nn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvf16ger2np\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvf16ger2pn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvf16ger2pp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvf32ger\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvf32gernn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvf32gernp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvf32gerpn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvf32gerpp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvi16ger2\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvi16ger2pp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvi16ger2s\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvi16ger2spp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvi4ger8\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvi4ger8pp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvi8ger4\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvi8ger4pp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvi8ger4spp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvbf16ger2\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvbf16ger2nn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvbf16ger2np\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvbf16ger2pn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvbf16ger2pp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvf16ger2\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvf16ger2nn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvf16ger2np\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvf16ger2pn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvf16ger2pp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvf32ger\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvf32gernn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvf32gernp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvf32gerpn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvf32gerpp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvi16ger2\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvi16ger2pp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvi16ger2s\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvi16ger2spp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvi4ger8\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvi4ger8pp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvi8ger4\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvi8ger4pp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvi8ger4spp\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-2.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-2.c
new file mode 100644 (file)
index 0000000..cb8b30d
--- /dev/null
@@ -0,0 +1,72 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_future_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=future -O2" } */
+
+typedef unsigned char  vec_t __attribute__((vector_size(16)));  /* 16-byte vector of unsigned char.  */
+
+void  /* Chain xvf64ger, xvf64gerpp and xvf64gerpn on one local accumulator.  */
+foo0 (__vector_quad *dst, vec_t *vec, __vector_pair *pvecp)
+{
+  __vector_quad acc;  /* Not initialized before the first built-in call.  */
+  __vector_pair vecp0 = *pvecp;
+  vec_t vec1 = vec[1];  /* Only element 1 of VEC is read.  */
+
+  __builtin_mma_xvf64ger (&acc, vecp0, vec1);
+  __builtin_mma_xvf64gerpp (&acc, vecp0, vec1);
+  __builtin_mma_xvf64gerpn (&acc, vecp0, vec1);
+  dst[0] = acc;  /* Copy the accumulator out through DST.  */
+}
+
+void  /* Load the accumulator from SRC, then apply xvf64gernp and xvf64gernn.  */
+foo1 (__vector_quad *dst, __vector_quad *src, vec_t *vec, __vector_pair *pvecp)
+{
+  __vector_quad acc;
+  __vector_pair vecp0 = *pvecp;
+  vec_t vec1 = vec[1];  /* Only element 1 of VEC is read.  */
+
+  acc = src[0];  /* Unlike foo0, the accumulator starts from a value read from memory.  */
+  __builtin_mma_xvf64gernp (&acc, vecp0, vec1);
+  __builtin_mma_xvf64gernn (&acc, vecp0, vec1);
+  dst[0] = acc;  /* Copy the accumulator out through DST.  */
+}
+
+void  /* Same sequence as foo0, using the pmxvf64ger* built-ins.  */
+foo2 (__vector_quad *dst, vec_t *vec, __vector_pair *pvecp)
+{
+  __vector_quad acc;  /* Not initialized before the first built-in call.  */
+  __vector_pair vecp0 = *pvecp;
+  vec_t vec1 = vec[1];  /* Only element 1 of VEC is read.  */
+  __builtin_mma_pmxvf64ger (&acc, vecp0, vec1, 15, 3);  /* NOTE(review): 15 and 3 are presumably mask immediates -- confirm.  */
+  __builtin_mma_pmxvf64gerpp (&acc, vecp0, vec1, 15, 3);
+  __builtin_mma_pmxvf64gerpn (&acc, vecp0, vec1, 15, 3);
+  dst[1] = acc;  /* Result goes to element 1 of DST, not element 0.  */
+}
+
+void  /* Same sequence as foo1, using the pmxvf64gernp/pmxvf64gernn built-ins.  */
+foo3 (__vector_quad *dst, __vector_quad *src, vec_t *vec, __vector_pair *pvecp)
+{
+  __vector_quad acc;
+  __vector_pair vecp0 = *pvecp;
+  vec_t vec1 = vec[1];  /* Only element 1 of VEC is read.  */
+
+  acc = src[0];  /* The accumulator starts from a value read from memory.  */
+  __builtin_mma_pmxvf64gernp (&acc, vecp0, vec1, 15, 3);  /* NOTE(review): 15 and 3 are presumably mask immediates -- confirm.  */
+  __builtin_mma_pmxvf64gernn (&acc, vecp0, vec1, 15, 3);
+  dst[1] = acc;  /* Result goes to element 1 of DST, not element 0.  */
+}
+
+/* { dg-final { scan-assembler-times {\mxxmfacc\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxmtacc\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mlxv\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 8 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 8 } } */
+/* { dg-final { scan-assembler-times {\mxvf64ger\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvf64gerpp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvf64gerpn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvf64gernp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvf64gernn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvf64ger\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvf64gerpp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvf64gerpn\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvf64gernp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mpmxvf64gernn\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-3.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-3.c
new file mode 100644 (file)
index 0000000..5406707
--- /dev/null
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_future_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=future -O2" } */
+
+void  /* Apply xxmtacc then xxmfacc to an accumulator that never goes through a pointer argument.  */
+foo0 (void)
+{
+  __vector_quad acc;
+  asm ("#..." : "=d" (acc));  /* ACC is defined only as the "=d" output of this asm.  */
+  __builtin_mma_xxmtacc (&acc);
+  __builtin_mma_xxmfacc (&acc);
+  asm ("#..." :: "d" (acc));  /* ACC is consumed only as the "d" input of this asm.  */
+}
+
+typedef unsigned char  vec_t __attribute__((vector_size(16)));  /* 16-byte vector of unsigned char.  */
+
+void  /* Call the xvcvspbf16 and xvcvbf16sp VSX built-ins once each.  */
+foo1 (vec_t *vec)
+{
+  vec[1] = __builtin_vsx_xvcvspbf16 (vec[0]);  /* Reads vec[0], writes vec[1].  */
+  vec[3] = __builtin_vsx_xvcvbf16sp (vec[2]);  /* Reads vec[2], writes vec[3].  */
+}
+
+/* { dg-final { scan-assembler-times {\mxxmtacc\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxxmfacc\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mlxv\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstxv\M} 2 } } */
+/* { dg-final { scan-assembler-not {\mlxvp\M} } } */
+/* { dg-final { scan-assembler-not {\mstxvp\M} } } */
+/* { dg-final { scan-assembler-times {\mxvcvspbf16\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvcvbf16sp\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-4.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-4.c
new file mode 100644 (file)
index 0000000..138d1b4
--- /dev/null
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_future_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=future -O2" } */
+
+typedef unsigned char vec_t __attribute__((vector_size(16)));  /* 16-byte vector of unsigned char.  */
+
+void  /* Build a __vector_pair from two vectors of SRC and store it to *DST.  */
+foo (__vector_pair *dst, vec_t *src)
+{
+  __vector_pair pair;
+  __builtin_mma_assemble_pair (&pair, src[0], src[4]);  /* The two inputs are not adjacent in SRC.  */
+  *dst = pair;
+}
+
+void  /* Split *SRC into two vectors and store them to DST.  */
+bar (vec_t *dst, __vector_pair *src)
+{
+  vec_t res[2];  /* Receives the two vectors written by the built-in.  */
+  __builtin_mma_disassemble_pair (res, src);
+  dst[0] = res[0];
+  dst[4] = res[1];  /* The two outputs are not adjacent in DST.  */
+}
+
+/* { dg-final { scan-assembler-times {\mlxv\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mstxv\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 1 } } */
+
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-5.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-5.c
new file mode 100644 (file)
index 0000000..0ee45b6
--- /dev/null
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_future_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=future -O2" } */
+
+typedef unsigned char vec_t __attribute__((vector_size(16)));  /* 16-byte vector of unsigned char.  */
+
+void  /* Build a __vector_quad from four vectors of SRC and store it to *DST.  */
+foo (__vector_quad *dst, vec_t *src)
+{
+  __vector_quad acc;
+  __builtin_mma_assemble_acc (&acc, src[0], src[4], src[8], src[12]);  /* Inputs are four elements apart in SRC.  */
+  *dst = acc;
+}
+
+void  /* Split *SRC into four vectors and store them to DST.  */
+bar (vec_t *dst, __vector_quad *src)
+{
+  vec_t res[4];  /* Receives the four vectors written by the built-in.  */
+  __builtin_mma_disassemble_acc (res, src);
+  dst[0] = res[0];
+  dst[4] = res[1];  /* Outputs are four elements apart in DST.  */
+  dst[8] = res[2];
+  dst[12] = res[3];
+}
+
+/* { dg-final { scan-assembler-times {\mlxv\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstxv\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxmfacc\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxmtacc\M} 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-6.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-6.c
new file mode 100644 (file)
index 0000000..c0b5eed
--- /dev/null
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_future_ok } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=future -O2" } */
+
+void  /* Apply xxsetaccz to a local accumulator and store it to *DST.  */
+foo (__vector_quad *dst)
+{
+  __vector_quad acc;  /* Not initialized before the built-in call.  */
+  __builtin_mma_xxsetaccz (&acc);
+  *dst = acc;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mlxvp\M} } } */
+/* { dg-final { scan-assembler-not {\mxxmtacc\M} } } */
+/* { dg-final { scan-assembler-times {\mxxsetaccz\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxxmfacc\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 2 } } */