From 025f434a87336e38bf5140fba2005081876aa911 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Thu, 5 Nov 2020 00:04:10 -0600 Subject: [PATCH] rs6000: Use direct move for char/short vector CTOR [PR96933] This patch is to make vector CTOR with char/short leverage direct move instructions when they are available. With one constructed test case, it can speed up 145% for char and 190% for short on P9. Tested SPEC2017 x264_r at -Ofast on P9, it gets 1.61% speedup (but based on unexpected SLP see PR96789). Bootstrapped/regtested on powerpc64{,le}-linux-gnu P8 and powerpc64le-linux-gnu P9. gcc/ChangeLog: PR target/96933 * config/rs6000/rs6000.c (rs6000_expand_vector_init): Use direct move instructions for vector construction with char/short types. * config/rs6000/rs6000.md (p8_mtvsrwz_v16qisi2): New define_insn. (p8_mtvsrd_v16qidi2): Likewise. gcc/testsuite/ChangeLog: PR target/96933 * gcc.target/powerpc/pr96933-1.c: New test. * gcc.target/powerpc/pr96933-2.c: New test. * gcc.target/powerpc/pr96933-3.c: New test. * gcc.target/powerpc/pr96933-4.c: New test. * gcc.target/powerpc/pr96933.h: New test. * gcc.target/powerpc/pr96933-run.h: New test. --- gcc/config/rs6000/rs6000.c | 179 ++++++++++++++++++++++++- gcc/config/rs6000/rs6000.md | 16 +++ gcc/testsuite/gcc.target/powerpc/pr96933-1.c | 15 +++ gcc/testsuite/gcc.target/powerpc/pr96933-2.c | 15 +++ gcc/testsuite/gcc.target/powerpc/pr96933-3.c | 10 ++ gcc/testsuite/gcc.target/powerpc/pr96933-4.c | 10 ++ gcc/testsuite/gcc.target/powerpc/pr96933-run.h | 56 ++++++++ gcc/testsuite/gcc.target/powerpc/pr96933.h | 50 +++++++ 8 files changed, 349 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr96933-1.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr96933-2.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr96933-3.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr96933-4.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr96933-run.h create mode 100644 gcc/testsuite/gcc.target/powerpc/pr96933.h diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 1e506b8..e613353 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6522,11 +6522,11 @@ rs6000_expand_vector_init (rtx target, rtx vals) { machine_mode mode = GET_MODE (target); machine_mode inner_mode = GET_MODE_INNER (mode); - int n_elts = GET_MODE_NUNITS (mode); + unsigned int n_elts = GET_MODE_NUNITS (mode); int n_var = 0, one_var = -1; bool all_same = true, all_const_zero = true; rtx x, mem; - int i; + unsigned int i; for (i = 0; i < n_elts; ++i) { @@ -6792,6 +6792,181 @@ rs6000_expand_vector_init (rtx target, rtx vals) return; } + if (TARGET_DIRECT_MOVE && (mode == V16QImode || mode == V8HImode)) + { + rtx op[16]; + /* Force the values into word_mode registers. */ + for (i = 0; i < n_elts; i++) + { + rtx tmp = force_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i)); + if (TARGET_64BIT) + { + op[i] = gen_reg_rtx (DImode); + emit_insn (gen_zero_extendqidi2 (op[i], tmp)); + } + else + { + op[i] = gen_reg_rtx (SImode); + emit_insn (gen_zero_extendqisi2 (op[i], tmp)); + } + } + + /* Take unsigned char big endianness on 64bit as example for below + construction, the input values are: A, B, C, D, ..., O, P. */ + + if (TARGET_DIRECT_MOVE_128) + { + /* Move to VSX register with vec_concat, each has 2 values. + eg: vr1[0] = { xxxxxxxA, xxxxxxxB }; + vr1[1] = { xxxxxxxC, xxxxxxxD }; + ... + vr1[7] = { xxxxxxxO, xxxxxxxP }; */ + rtx vr1[8]; + for (i = 0; i < n_elts / 2; i++) + { + vr1[i] = gen_reg_rtx (V2DImode); + emit_insn (gen_vsx_concat_v2di (vr1[i], op[i * 2], + op[i * 2 + 1])); + } + + /* Pack vectors with 2 values into vectors with 4 values. + eg: vr2[0] = { xxxAxxxB, xxxCxxxD }; + vr2[1] = { xxxExxxF, xxxGxxxH }; + vr2[1] = { xxxIxxxJ, xxxKxxxL }; + vr2[3] = { xxxMxxxN, xxxOxxxP }; */ + rtx vr2[4]; + for (i = 0; i < n_elts / 4; i++) + { + vr2[i] = gen_reg_rtx (V4SImode); + emit_insn (gen_altivec_vpkudum (vr2[i], vr1[i * 2], + vr1[i * 2 + 1])); + } + + /* Pack vectors with 4 values into vectors with 8 values. + eg: vr3[0] = { xAxBxCxD, xExFxGxH }; + vr3[1] = { xIxJxKxL, xMxNxOxP }; */ + rtx vr3[2]; + for (i = 0; i < n_elts / 8; i++) + { + vr3[i] = gen_reg_rtx (V8HImode); + emit_insn (gen_altivec_vpkuwum (vr3[i], vr2[i * 2], + vr2[i * 2 + 1])); + } + + /* If it's V8HImode, it's done and return it. */ + if (mode == V8HImode) + { + emit_insn (gen_rtx_SET (target, vr3[0])); + return; + } + + /* Pack vectors with 8 values into 16 values. */ + rtx res = gen_reg_rtx (V16QImode); + emit_insn (gen_altivec_vpkuhum (res, vr3[0], vr3[1])); + emit_insn (gen_rtx_SET (target, res)); + } + else + { + rtx (*merge_v16qi) (rtx, rtx, rtx) = NULL; + rtx (*merge_v8hi) (rtx, rtx, rtx) = NULL; + rtx (*merge_v4si) (rtx, rtx, rtx) = NULL; + rtx perm_idx; + + /* Set up some common gen routines and values. */ + if (BYTES_BIG_ENDIAN) + { + if (mode == V16QImode) + { + merge_v16qi = gen_altivec_vmrghb; + merge_v8hi = gen_altivec_vmrglh; + } + else + merge_v8hi = gen_altivec_vmrghh; + + merge_v4si = gen_altivec_vmrglw; + perm_idx = GEN_INT (3); + } + else + { + if (mode == V16QImode) + { + merge_v16qi = gen_altivec_vmrglb; + merge_v8hi = gen_altivec_vmrghh; + } + else + merge_v8hi = gen_altivec_vmrglh; + + merge_v4si = gen_altivec_vmrghw; + perm_idx = GEN_INT (0); + } + + /* Move to VSX register with direct move. + eg: vr_qi[0] = { xxxxxxxA, xxxxxxxx }; + vr_qi[1] = { xxxxxxxB, xxxxxxxx }; + ... + vr_qi[15] = { xxxxxxxP, xxxxxxxx }; */ + rtx vr_qi[16]; + for (i = 0; i < n_elts; i++) + { + vr_qi[i] = gen_reg_rtx (V16QImode); + if (TARGET_64BIT) + emit_insn (gen_p8_mtvsrd_v16qidi2 (vr_qi[i], op[i])); + else + emit_insn (gen_p8_mtvsrwz_v16qisi2 (vr_qi[i], op[i])); + } + + /* Merge/move to vector short. + eg: vr_hi[0] = { xxxxxxxx, xxxxxxAB }; + vr_hi[1] = { xxxxxxxx, xxxxxxCD }; + ... + vr_hi[7] = { xxxxxxxx, xxxxxxOP }; */ + rtx vr_hi[8]; + for (i = 0; i < 8; i++) + { + rtx tmp = vr_qi[i]; + if (mode == V16QImode) + { + tmp = gen_reg_rtx (V16QImode); + emit_insn (merge_v16qi (tmp, vr_qi[2 * i], vr_qi[2 * i + 1])); + } + vr_hi[i] = gen_reg_rtx (V8HImode); + emit_move_insn (vr_hi[i], gen_lowpart (V8HImode, tmp)); + } + + /* Merge vector short to vector int. + eg: vr_si[0] = { xxxxxxxx, xxxxABCD }; + vr_si[1] = { xxxxxxxx, xxxxEFGH }; + ... + vr_si[3] = { xxxxxxxx, xxxxMNOP }; */ + rtx vr_si[4]; + for (i = 0; i < 4; i++) + { + rtx tmp = gen_reg_rtx (V8HImode); + emit_insn (merge_v8hi (tmp, vr_hi[2 * i], vr_hi[2 * i + 1])); + vr_si[i] = gen_reg_rtx (V4SImode); + emit_move_insn (vr_si[i], gen_lowpart (V4SImode, tmp)); + } + + /* Merge vector int to vector long. + eg: vr_di[0] = { xxxxxxxx, ABCDEFGH }; + vr_di[1] = { xxxxxxxx, IJKLMNOP }; */ + rtx vr_di[2]; + for (i = 0; i < 2; i++) + { + rtx tmp = gen_reg_rtx (V4SImode); + emit_insn (merge_v4si (tmp, vr_si[2 * i], vr_si[2 * i + 1])); + vr_di[i] = gen_reg_rtx (V2DImode); + emit_move_insn (vr_di[i], gen_lowpart (V2DImode, tmp)); + } + + rtx res = gen_reg_rtx (V2DImode); + emit_insn (gen_vsx_xxpermdi_v2di (res, vr_di[0], vr_di[1], perm_idx)); + emit_insn (gen_rtx_SET (target, gen_lowpart (mode, res))); + } + + return; + } + /* Construct the vector in memory one field at a time and load the whole vector. */ mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index dc06014..acbf130 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -8713,6 +8713,22 @@ "mtvsrwz %x0,%1" [(set_attr "type" "mtvsr")]) +(define_insn "p8_mtvsrwz_v16qisi2" + [(set (match_operand:V16QI 0 "register_operand" "=wa") + (unspec:V16QI [(match_operand:SI 1 "register_operand" "r")] + UNSPEC_P8V_MTVSRWZ))] + "!TARGET_POWERPC64 && TARGET_DIRECT_MOVE" + "mtvsrwz %x0,%1" + [(set_attr "type" "mtvsr")]) + +(define_insn "p8_mtvsrd_v16qidi2" + [(set (match_operand:V16QI 0 "register_operand" "=wa") + (unspec:V16QI [(match_operand:DI 1 "register_operand" "r")] + UNSPEC_P8V_MTVSRD))] + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" + "mtvsrd %x0,%1" + [(set_attr "type" "mtvsr")]) + (define_insn_and_split "reload_fpr_from_gpr" [(set (match_operand:FMOVE64X 0 "register_operand" "=d") (unspec:FMOVE64X [(match_operand:FMOVE64X 1 "register_operand" "r")] diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-1.c b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c new file mode 100644 index 0000000..3b63865 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-require-effective-target powerpc_p9vector_ok } */ +/* { dg-options "-O2 -mdejagnu-cpu=power9" } */ + +/* Test vector constructions with char/short type values whether use 128bit + direct move instructions mtvsrdd on Power9 or later, rather than transfering + with memory store/load with stb/sth and vector load. */ + +#include "pr96933.h" + +/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 24 } } */ +/* { dg-final { scan-assembler-times {\mvpkudum\M} 12 } } */ +/* { dg-final { scan-assembler-not {\mstb\M} } } */ +/* { dg-final { scan-assembler-not {\msth\M} } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-2.c b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c new file mode 100644 index 0000000..cef8fbd --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target powerpc_p8vector_ok } */ +/* { dg-options "-O2 -mdejagnu-cpu=power8" } */ + +/* Test vector constructions with char/short type values whether use direct + move instructions like mtvsrd/mtvsrwz on Power8, rather than transfering + with memory store/load with stb/sth and vector load. */ + +#include "pr96933.h" + +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 48 {target lp64 } } } */ +/* { dg-final { scan-assembler-times {\mmtvsrwz\M} 48 {target {! lp64 } } } } */ +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 4 } } */ +/* { dg-final { scan-assembler-not {\mstb\M} } } */ +/* { dg-final { scan-assembler-not {\msth\M} } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-3.c b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c new file mode 100644 index 0000000..3e5709a --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c @@ -0,0 +1,10 @@ +/* { dg-do run } */ +/* { dg-require-effective-target p8vector_hw } */ +/* { dg-options "-O2 -mdejagnu-cpu=power8" } */ + +/* Test vector constructions with char/short run successfully on Power8. */ + +#include +#include "pr96933.h" +#include "pr96933-run.h" + diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-4.c b/gcc/testsuite/gcc.target/powerpc/pr96933-4.c new file mode 100644 index 0000000..5a1c3d0 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-4.c @@ -0,0 +1,10 @@ +/* { dg-do run } */ +/* { dg-require-effective-target p9vector_hw } */ +/* { dg-options "-O2 -mdejagnu-cpu=power9" } */ + +/* Test vector constructions with char/short run successfully on Power9. */ + +#include +#include "pr96933.h" +#include "pr96933-run.h" + diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-run.h b/gcc/testsuite/gcc.target/powerpc/pr96933-run.h new file mode 100644 index 0000000..7fa8dac --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-run.h @@ -0,0 +1,56 @@ +/* Test function for pr96933-{3,4}.c run result verification. */ + +int +main () +{ + unsigned char uc[16]; + signed char sc[16]; + + for (int i = 0; i < 16; i++) + { + uc[i] = (unsigned char) (i * 2 + 1); + sc[i] = (signed char) ((i % 2 == 0) ? (i + 1) : -i); + } + + vector unsigned char ucv + = test_uchar (uc[0], uc[1], uc[2], uc[3], uc[4], uc[5], uc[6], uc[7], uc[8], + uc[9], uc[10], uc[11], uc[12], uc[13], uc[14], uc[15]); + vector signed char scv + = test_schar (sc[0], sc[1], sc[2], sc[3], sc[4], sc[5], sc[6], sc[7], sc[8], + sc[9], sc[10], sc[11], sc[12], sc[13], sc[14], sc[15]); + + for (int i = 0; i < 16; i++) + { + unsigned char uexp = (unsigned char) (i * 2 + 1); + signed char sexp = (signed char) ((i % 2 == 0) ? (i + 1) : -i); + if (ucv[i] != uexp) + abort (); + if (scv[i] != sexp) + abort (); + } + + unsigned short us[8]; + signed short ss[8]; + for (int i = 0; i < 8; i++) + { + us[i] = (unsigned short) (i * 2 + 1); + ss[i] = (signed short) ((i % 2 == 0) ? (i + 1) : -i); + } + + vector unsigned short usv + = test_ushort (us[0], us[1], us[2], us[3], us[4], us[5], us[6], us[7]); + vector signed short ssv + = test_sshort (ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7]); + + for (int i = 0; i < 8; i++) + { + unsigned short uexp = (unsigned short) (i * 2 + 1); + signed short sexp = (signed short) ((i % 2 == 0) ? (i + 1) : -i); + if (usv[i] != uexp) + abort (); + if (ssv[i] != sexp) + abort (); + } + + return 0; +} diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933.h b/gcc/testsuite/gcc.target/powerpc/pr96933.h new file mode 100644 index 0000000..4bc2b94 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr96933.h @@ -0,0 +1,50 @@ +/* Source file for pr96933-*.c testing, this mainly contains 4 + functions as below: + + - test_uchar // vector unsigned char + - test_schar // vector signed char + - test_ushort // vector unsigned short + - test_sshort // vector signed short +*/ + +__attribute__ ((noipa)) vector unsigned char +test_uchar (unsigned char c1, unsigned char c2, unsigned char c3, + unsigned char c4, unsigned char c5, unsigned char c6, + unsigned char c7, unsigned char c8, unsigned char c9, + unsigned char c10, unsigned char c11, unsigned char c12, + unsigned char c13, unsigned char c14, unsigned char c15, + unsigned char c16) +{ + vector unsigned char v + = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16}; + return v; +} + +__attribute__ ((noipa)) vector signed char +test_schar (signed char c1, signed char c2, signed char c3, signed char c4, + signed char c5, signed char c6, signed char c7, signed char c8, + signed char c9, signed char c10, signed char c11, signed char c12, + signed char c13, signed char c14, signed char c15, signed char c16) +{ + vector signed char v + = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16}; + return v; +} + +__attribute__ ((noipa)) vector unsigned short +test_ushort (unsigned short c1, unsigned short c2, unsigned short c3, + unsigned short c4, unsigned short c5, unsigned short c6, + unsigned short c7, unsigned short c8) +{ + vector unsigned short v = {c1, c2, c3, c4, c5, c6, c7, c8}; + return v; +} + +__attribute__ ((noipa)) vector signed short +test_sshort (signed short c1, signed short c2, signed short c3, + signed short c4, signed short c5, signed short c6, + signed short c7, signed short c8) +{ + vector signed short v = {c1, c2, c3, c4, c5, c6, c7, c8}; + return v; +} -- 2.7.4