From bd364aaee338fbc6e3a49043614331ff471e7f4d Mon Sep 17 00:00:00 2001
From: Jan Hubicka
Date: Wed, 17 Mar 2021 22:37:11 +0100
Subject: [PATCH] Enable gather on zen3 hardware.

For TSVC, gather gets used by 5 benchmarks with the following runtime
improvements:

  s4114: 1.424 -> 1.209 (84.9017%)
  s4115: 2.021 -> 1.065 (52.6967%)
  s4116: 1.549 -> 0.854 (55.1323%)
  s4117: 1.386 -> 1.193 (86.075%)
  vag:   2.741 -> 1.940 (70.7771%)

There is a regression in s4112:

  s4112: 1.115 -> 1.184 (106.188%)

Its internal loop is:

  for (int i = 0; i < LEN_1D; i++)
    {
      a[i] += b[ip[i]] * s;
    }

(so a standard accumulate and add with indirect addressing).  The
vectorized loop is:

  40a400:  c5 fe 6f 24 03          vmovdqu (%rbx,%rax,1),%ymm4
  40a405:  c5 fc 28 da             vmovaps %ymm2,%ymm3
  40a409:  48 83 c0 20             add    $0x20,%rax
  40a40d:  c4 e2 65 92 04 a5 00    vgatherdps %ymm3,0x594100(,%ymm4,4),%ymm0
  40a414:  41 59 00
  40a417:  c4 e2 75 a8 80 e0 34    vfmadd213ps 0x5b34e0(%rax),%ymm1,%ymm0
  40a41e:  5b 00
  40a420:  c5 fc 29 80 e0 34 5b    vmovaps %ymm0,0x5b34e0(%rax)
  40a427:  00
  40a428:  48 3d 00 f4 01 00       cmp    $0x1f400,%rax
  40a42e:  75 d0                   jne    40a400

compared to the scalar loop:

  40a280:  49 63 14 04             movslq (%r12,%rax,1),%rdx
  40a284:  48 83 c0 04             add    $0x4,%rax
  40a288:  c5 fa 10 04 95 00 41    vmovss 0x594100(,%rdx,4),%xmm0
  40a28f:  59 00
  40a291:  c4 e2 71 a9 80 fc 34    vfmadd213ss 0x5b34fc(%rax),%xmm1,%xmm0
  40a298:  5b 00
  40a29a:  c5 fa 11 80 fc 34 5b    vmovss %xmm0,0x5b34fc(%rax)
  40a2a1:  00
  40a2a2:  48 3d 00 f4 01 00       cmp    $0x1f400,%rax
  40a2a8:  75 d6                   jne    40a280

Looking at instruction latencies:

  - fmadd is 4 cycles
  - vgatherdps is 39 cycles

So the vgather itself takes 4.8 cycles per scalar iteration (39 cycles
for 8 lanes), and the CPU is probably able to execute the rest out of
order, getting close to 4 cycles per iteration (it can do 2 loads in
parallel, one store, and the rest fits easily into the execution
resources).  That would explain the 20% slowdown.

The gimple internal loop is:

  _2 = a[i_38];
  _3 = (long unsigned int) i_38;
  _4 = _3 * 4;
  _5 = ip_18 + _4;
  _6 = *_5;
  _7 = b[_6];
  _8 = _7 * s_19;
  _9 = _2 + _8;
  a[i_38] = _9;
  i_28 = i_38 + 1;
  ivtmp_52 = ivtmp_53 - 1;
  if (ivtmp_52 != 0)
    goto ; [98.99%]
  else
    goto ; [1.01%]

and the vectorizer costs it as:

  0x25bac30 a[i_38] 1 times scalar_load costs 12 in body
  0x25bac30 *_5 1 times scalar_load costs 12 in body
  0x25bac30 b[_6] 1 times scalar_load costs 12 in body
  0x25bac30 _7 * s_19 1 times scalar_stmt costs 12 in body
  0x25bac30 _2 + _8 1 times scalar_stmt costs 12 in body
  0x25bac30 _9 1 times scalar_store costs 16 in body

so a 19-cycle estimate for the scalar loop body (76 cost units at 4
units per cycle), and:

  0x2668630 a[i_38] 1 times vector_load costs 12 in body
  0x2668630 *_5 1 times unaligned_load (misalign -1) costs 12 in body
  0x2668630 b[_6] 8 times scalar_load costs 96 in body
  0x2668630 _7 * s_19 1 times scalar_to_vec costs 4 in prologue
  0x2668630 _7 * s_19 1 times vector_stmt costs 12 in body
  0x2668630 _2 + _8 1 times vector_stmt costs 12 in body
  0x2668630 _9 1 times vector_store costs 16 in body

so 40 cycles per 8x vectorized body (160 units / 4):

  tsvc.c:3450:27: note: operating only on full vectors.
  tsvc.c:3450:27: note: Cost model analysis:
    Vector inside of loop cost: 160
    Vector prologue cost: 4
    Vector epilogue cost: 0
    Scalar iteration cost: 76
    Scalar outside cost: 0
    Vector outside cost: 4
    prologue iterations: 0
    epilogue iterations: 0
  Calculated minimum iters for profitability: 1

I think this generally suffers from the GIGO principle.  One problem
seems to be that we do not know about fmadd yet and cost the combined
multiply and add as two instructions (6 cycles instead of 4).  A more
important problem is that we do not account for the parallelism at all.
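To make the cost-table semantics concrete, here is a minimal sketch,
assuming the static/per-element split named in the "Gather load static,
per_elt" comments in the diff below; the helper function is
hypothetical, only the arithmetic matters:

  /* Hypothetical helper, not GCC code, illustrating the gather cost
     split: a fixed static part plus a per-element part scaled by the
     lane count.  With the new znver3 values an 8-lane gather load
     comes out as 14 + 8 * 8 = 78 cost units.  */
  static int
  gather_load_cost (int static_cost, int per_elt_cost, int n_elts)
  {
    return static_cost + per_elt_cost * n_elts;
  }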
I do not see how to disable the vectorization here without bumping the
gather costs noticeably away from reality, so we can experiment with
this further if more similar problems are found.  Icc also uses gather
in s1115 and s128; for s1115 the vectorization does not seem to help,
and s128 gets slower.  Clang and aocc do not use gathers.

        * config/i386/x86-tune-costs.h (struct processor_costs): Update
        costs of gather to match reality.
        * config/i386/x86-tune.def (X86_TUNE_USE_GATHER): Enable for
        znver3.
---
 gcc/config/i386/x86-tune-costs.h | 10 +++++-----
 gcc/config/i386/x86-tune.def     |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index e655e66..db03738 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1767,11 +1767,11 @@ struct processor_costs znver3_cost = {
   2, 2, 3,                      /* cost of moving XMM,YMM,ZMM register.  */
   6,                            /* cost of moving SSE register to integer.  */
-  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
-     throughput 12.  Approx 9 uops do not depend on vector size and every load
-     is 7 uops.  */
-  18, 8,                        /* Gather load static, per_elt.  */
-  18, 10,                       /* Gather store static, per_elt.  */
+  /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
+     throughput 9.  Approx 7 uops do not depend on vector size and every load
+     is 4 uops.  */
+  14, 8,                        /* Gather load static, per_elt.  */
+  14, 10,                       /* Gather store static, per_elt.  */
   32,                           /* size of l1 cache.  */
   512,                          /* size of l2 cache.  */
   64,                           /* size of prefetch block.  */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 140ccb3..caebf76 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -436,7 +436,7 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
 
 /* X86_TUNE_USE_GATHER: Use gather instructions.  */
 DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
-	  ~(m_ZNVER | m_GENERIC))
+	  ~(m_ZNVER1 | m_ZNVER2 | m_GENERIC))
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
    smaller FMA chain.  */
-- 
2.7.4
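P.S. A quick way to check the effect of the tuning change (a sketch,
assuming a compiler built with this patch; the kernel is just the s4112
loop above pulled into a standalone file, with LEN_1D as in TSVC):
building the function below with -O3 -march=znver3 should now emit
vgatherdps for the b[ip[i]] load, while -march=znver1 or -march=znver2
should keep the scalar loads.

  /* gather-check.c: compile with -O3 -march=znver3 and look for
     vgatherdps in the generated assembly.  */
  #define LEN_1D 32000

  float a[LEN_1D], b[LEN_1D];
  int ip[LEN_1D];

  void
  s4112_kernel (float s)
  {
    for (int i = 0; i < LEN_1D; i++)
      a[i] += b[ip[i]] * s;
  }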