x86: Update memcpy/memset inline strategies for Skylake family CPUs
authorH.J. Lu <hjl.tools@gmail.com>
Fri, 12 Mar 2021 00:56:26 +0000 (16:56 -0800)
committerH.J. Lu <hjl.tools@gmail.com>
Tue, 6 Apr 2021 12:36:00 +0000 (05:36 -0700)
Simply memcpy and memset inline strategies to avoid branches for
Skylake family CPUs:

1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
   load and store for up to 16 * 16 (256) bytes when the data size is
   fixed and known.
2. Inline only if data size is known to be <= 256.
   a. Use "rep movsb/stosb" with simple code sequence if the data size
      is a constant.
   b. Use loop if data size is not a constant.
3. Use memcpy/memset libray function if data size is unknown or > 256.

On Cascadelake processor with -march=native -Ofast -flto,

1. Performance impacts of SPEC CPU 2017 rate are:

500.perlbench_r  0.17%
502.gcc_r       -0.36%
505.mcf_r        0.00%
520.omnetpp_r    0.08%
523.xalancbmk_r -0.62%
525.x264_r       1.04%
531.deepsjeng_r  0.11%
541.leela_r     -1.09%
548.exchange2_r -0.25%
557.xz_r         0.17%
Geomean         -0.08%

503.bwaves_r     0.00%
507.cactuBSSN_r  0.69%
508.namd_r      -0.07%
510.parest_r     1.12%
511.povray_r     1.82%
519.lbm_r        0.00%
521.wrf_r       -1.32%
526.blender_r   -0.47%
527.cam4_r       0.23%
538.imagick_r   -1.72%
544.nab_r       -0.56%
549.fotonik3d_r  0.12%
554.roms_r       0.43%
Geomean          0.02%

2. Significant impacts on eembc benchmarks are:

eembc/idctrn01   9.23%
eembc/nnet_test  29.26%

gcc/

* config/i386/x86-tune-costs.h (skylake_memcpy): Updated.
(skylake_memset): Likewise.
(skylake_cost): Change CLEAR_RATIO to 17.
* config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
Replace m_CANNONLAKE, m_ICELAKE_CLIENT, m_ICELAKE_SERVER,
m_TIGERLAKE and m_SAPPHIRERAPIDS with m_SKYLAKE and m_CORE_AVX512.

gcc/testsuite/

* gcc.target/i386/memcpy-strategy-9.c: New test.
* gcc.target/i386/memcpy-strategy-10.c: Likewise.
* gcc.target/i386/memcpy-strategy-11.c: Likewise.
* gcc.target/i386/memset-strategy-7.c: Likewise.
* gcc.target/i386/memset-strategy-8.c: Likewise.
* gcc.target/i386/memset-strategy-9.c: Likewise.

gcc/config/i386/x86-tune-costs.h
gcc/config/i386/x86-tune.def
gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-strategy-7.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-strategy-8.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/memset-strategy-9.c [new file with mode: 0644]

index 0e00ff9..ffe810f 100644 (file)
@@ -1822,17 +1822,24 @@ struct processor_costs znver3_cost = {
 
 /* skylake_cost should produce code tuned for Skylake familly of CPUs.  */
 static stringop_algs skylake_memcpy[2] =   {
-  {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
-  {libcall, {{16, loop, false}, {512, unrolled_loop, false},
-             {-1, libcall, false}}}};
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}}};
 
 static stringop_algs skylake_memset[2] = {
-  {libcall, {{6, loop_1_byte, true},
-             {24, loop, true},
-             {8192, rep_prefix_4_byte, true},
-             {-1, libcall, false}}},
-  {libcall, {{24, loop, true}, {512, unrolled_loop, false},
-             {-1, libcall, false}}}};
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}}};
 
 static const
 struct processor_costs skylake_cost = {
@@ -1889,7 +1896,7 @@ struct processor_costs skylake_cost = {
   COSTS_N_INSNS (0),                   /* cost of movzx */
   8,                                   /* "large" insn */
   17,                                  /* MOVE_RATIO */
-  6,                                   /* CLEAR_RATIO */
+  17,                                  /* CLEAR_RATIO */
   {4, 4, 4},                           /* cost of loading integer registers
                                           in QImode, HImode and SImode.
                                           Relative to reg-reg move (2).  */
index 134916c..eb057a6 100644 (file)
@@ -273,8 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
    move/set sequences of bytes with known size.  */
 DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
          "prefer_known_rep_movsb_stosb",
-         m_CANNONLAKE | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_TIGERLAKE
-         | m_ALDERLAKE | m_SAPPHIRERAPIDS)
+         m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
 
 /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
    compact prologues and epilogues by issuing a misaligned moves.  This
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c
new file mode 100644 (file)
index 0000000..970aa74
--- /dev/null
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+  __builtin_memcpy (dest, src, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c
new file mode 100644 (file)
index 0000000..b604194
--- /dev/null
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+/* { dg-final { scan-assembler-not "jmp\tmemcpy" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "call\tmemcpy" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
+
+typedef unsigned char e_u8;
+
+#define MAXBC 8
+
+void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
+{
+  e_u8 b[4][MAXBC];
+  int i, j;
+
+  for(i = 0; i < 4; i++)
+    for(j = 0; j < BC; j++) a[i][j] = b[i][j];
+}
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c
new file mode 100644 (file)
index 0000000..b0dc748
--- /dev/null
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+  __builtin_memcpy (dest, src, 256);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-7.c b/gcc/testsuite/gcc.target/i386/memset-strategy-7.c
new file mode 100644 (file)
index 0000000..07c2816
--- /dev/null
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "jmp\tmemset" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-8.c b/gcc/testsuite/gcc.target/i386/memset-strategy-8.c
new file mode 100644 (file)
index 0000000..52ea882
--- /dev/null
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mno-sse" } */
+/* { dg-final { scan-assembler "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 256);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-9.c b/gcc/testsuite/gcc.target/i386/memset-strategy-9.c
new file mode 100644 (file)
index 0000000..d4db031
--- /dev/null
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+/* { dg-final { scan-assembler-not "jmp\tmemset" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "call\tmemset" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
+
+typedef unsigned char e_u8;
+
+#define MAXBC 8
+
+void MixColumn(e_u8 a[4][MAXBC], e_u8 BC)
+{
+  int i, j;
+
+  for(i = 0; i < 4; i++)
+    for(j = 0; j < BC; j++) a[i][j] = 1;
+}