From bd364aaee338fbc6e3a49043614331ff471e7f4d Mon Sep 17 00:00:00 2001
From: Jan Hubicka
Date: Wed, 17 Mar 2021 22:37:11 +0100
Subject: [PATCH] Enable gather on zen3 hardware.

For TSVC, gather gets used by 5 benchmarks with the following runtime
improvements:

  s4114: 1.424 -> 1.209 (84.9017%)
  s4115: 2.021 -> 1.065 (52.6967%)
  s4116: 1.549 -> 0.854 (55.1323%)
  s4117: 1.386 -> 1.193 (86.075%)
  vag:   2.741 -> 1.940 (70.7771%)

There is a regression in s4112:

  s4112: 1.115 -> 1.184 (106.188%)

Its internal loop is:

  for (int i = 0; i < LEN_1D; i++)
    {
      a[i] += b[ip[i]] * s;
    }

(so a standard accumulate and add with indirect addressing).  The
vectorized loop is:

  40a400:  c5 fe 6f 24 03          vmovdqu (%rbx,%rax,1),%ymm4
  40a405:  c5 fc 28 da             vmovaps %ymm2,%ymm3
  40a409:  48 83 c0 20             add    $0x20,%rax
  40a40d:  c4 e2 65 92 04 a5 00    vgatherdps %ymm3,0x594100(,%ymm4,4),%ymm0
  40a414:  41 59 00
  40a417:  c4 e2 75 a8 80 e0 34    vfmadd213ps 0x5b34e0(%rax),%ymm1,%ymm0
  40a41e:  5b 00
  40a420:  c5 fc 29 80 e0 34 5b    vmovaps %ymm0,0x5b34e0(%rax)
  40a427:  00
  40a428:  48 3d 00 f4 01 00       cmp    $0x1f400,%rax
  40a42e:  75 d0                   jne    40a400

compared to the scalar loop:

  40a280:  49 63 14 04             movslq (%r12,%rax,1),%rdx
  40a284:  48 83 c0 04             add    $0x4,%rax
  40a288:  c5 fa 10 04 95 00 41    vmovss 0x594100(,%rdx,4),%xmm0
  40a28f:  59 00
  40a291:  c4 e2 71 a9 80 fc 34    vfmadd213ss 0x5b34fc(%rax),%xmm1,%xmm0
  40a298:  5b 00
  40a29a:  c5 fa 11 80 fc 34 5b    vmovss %xmm0,0x5b34fc(%rax)
  40a2a1:  00
  40a2a2:  48 3d 00 f4 01 00       cmp    $0x1f400,%rax
  40a2a8:  75 d6                   jne    40a280

Looking at instruction latencies:

  - fmadd is 4 cycles
  - vgatherdps is 39 cycles

So the vgather itself takes 4.8 cycles per scalar iteration (39 cycles
for 8 lanes), and the CPU is probably able to execute the rest out of
order, getting close to 4 cycles per iteration (it can do 2 loads in
parallel, one store, and the rest fits easily into the execution
resources).  That would explain the 20% slowdown.

The gimple internal loop is:

  _2 = a[i_38];
  _3 = (long unsigned int) i_38;
  _4 = _3 * 4;
  _5 = ip_18 + _4;
  _6 = *_5;
  _7 = b[_6];
  _8 = _7 * s_19;
  _9 = _2 + _8;
  a[i_38] = _9;
  i_28 = i_38 + 1;
  ivtmp_52 = ivtmp_53 - 1;
  if (ivtmp_52 != 0)
    goto ; [98.99%]
  else
    goto ; [1.01%]

and the vectorizer costs it as:

  0x25bac30 a[i_38] 1 times scalar_load costs 12 in body
  0x25bac30 *_5 1 times scalar_load costs 12 in body
  0x25bac30 b[_6] 1 times scalar_load costs 12 in body
  0x25bac30 _7 * s_19 1 times scalar_stmt costs 12 in body
  0x25bac30 _2 + _8 1 times scalar_stmt costs 12 in body
  0x25bac30 _9 1 times scalar_store costs 16 in body

so a 19-cycle estimate for the scalar loop body (76 cost units at 4
units per cycle), and:

  0x2668630 a[i_38] 1 times vector_load costs 12 in body
  0x2668630 *_5 1 times unaligned_load (misalign -1) costs 12 in body
  0x2668630 b[_6] 8 times scalar_load costs 96 in body
  0x2668630 _7 * s_19 1 times scalar_to_vec costs 4 in prologue
  0x2668630 _7 * s_19 1 times vector_stmt costs 12 in body
  0x2668630 _2 + _8 1 times vector_stmt costs 12 in body
  0x2668630 _9 1 times vector_store costs 16 in body

so 40 cycles per 8x vectorized body (160 units / 4):

  tsvc.c:3450:27: note: operating only on full vectors.
  tsvc.c:3450:27: note: Cost model analysis:
    Vector inside of loop cost: 160
    Vector prologue cost: 4
    Vector epilogue cost: 0
    Scalar iteration cost: 76
    Scalar outside cost: 0
    Vector outside cost: 4
    prologue iterations: 0
    epilogue iterations: 0
  Calculated minimum iters for profitability: 1

I think this generally suffers from the GIGO principle.  One problem
seems to be that we do not know about fmadd yet and cost the combined
multiply and add as two instructions (6 cycles instead of 4).  A more
important problem is that we do not account for the parallelism at all.
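To make the cost-table semantics concrete, here is a minimal sketch,
assuming the static/per-element split named in the "Gather load static,
per_elt" comments in the diff below; the helper function is
hypothetical, only the arithmetic matters:

  /* Hypothetical helper, not GCC code, illustrating the gather cost
     split: a fixed static part plus a per-element part scaled by the
     lane count.  With the new znver3 values an 8-lane gather load
     comes out as 14 + 8 * 8 = 78 cost units.  */
  static int
  gather_load_cost (int static_cost, int per_elt_cost, int n_elts)
  {
    return static_cost + per_elt_cost * n_elts;
  }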
I do not see how to disable the vectorization here without bumping the
gather costs noticeably away from reality, so we can experiment with
this further if more similar problems are found.  Icc also uses gather
in s1115 and s128; for s1115 the vectorization does not seem to help,
and s128 gets slower.  Clang and aocc do not use gathers.

        * config/i386/x86-tune-costs.h (struct processor_costs): Update
        costs of gather to match reality.
        * config/i386/x86-tune.def (X86_TUNE_USE_GATHER): Enable for
        znver3.
---
 gcc/config/i386/x86-tune-costs.h | 10 +++++-----
 gcc/config/i386/x86-tune.def     |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index e655e66..db03738 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1767,11 +1767,11 @@ struct processor_costs znver3_cost = {
   2, 2, 3,                      /* cost of moving XMM,YMM,ZMM register.  */
   6,                            /* cost of moving SSE register to integer.  */
-  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
-     throughput 12.  Approx 9 uops do not depend on vector size and every load
-     is 7 uops.  */
-  18, 8,                        /* Gather load static, per_elt.  */
-  18, 10,                       /* Gather store static, per_elt.  */
+  /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
+     throughput 9.  Approx 7 uops do not depend on vector size and every load
+     is 4 uops.  */
+  14, 8,                        /* Gather load static, per_elt.  */
+  14, 10,                       /* Gather store static, per_elt.  */
   32,                           /* size of l1 cache.  */
   512,                          /* size of l2 cache.  */
   64,                           /* size of prefetch block.  */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 140ccb3..caebf76 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -436,7 +436,7 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
 
 /* X86_TUNE_USE_GATHER: Use gather instructions.  */
 DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
-	  ~(m_ZNVER | m_GENERIC))
+	  ~(m_ZNVER1 | m_ZNVER2 | m_GENERIC))
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
    smaller FMA chain.  */
-- 
2.7.4
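P.S. A quick way to check the effect of the tuning change (a sketch,
assuming a compiler built with this patch; the kernel is just the s4112
loop above pulled into a standalone file, with LEN_1D as in TSVC):
building the function below with -O3 -march=znver3 should now emit
vgatherdps for the b[ip[i]] load, while -march=znver1 or -march=znver2
should keep the scalar loads.

  /* gather-check.c: compile with -O3 -march=znver3 and look for
     vgatherdps in the generated assembly.  */
  #define LEN_1D 32000

  float a[LEN_1D], b[LEN_1D];
  int ip[LEN_1D];

  void
  s4112_kernel (float s)
  {
    for (int i = 0; i < LEN_1D; i++)
      a[i] += b[ip[i]] * s;
  }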