From 5b32a1817d9aa042698350b04496db6f4fce62d4 Mon Sep 17 00:00:00 2001
From: Jan Hubicka
Date: Mon, 15 Mar 2021 11:36:52 +0100
Subject: [PATCH] znver3 tuning part 1

2021-03-15  Jan Hubicka

	* config/i386/i386-options.c (processor_cost_table): Add znver3_cost.
	* config/i386/x86-tune-costs.h (znver3_cost): New global variable;
	copy of znver2_cost.
---
 gcc/config/i386/i386-options.c   |   2 +-
 gcc/config/i386/x86-tune-costs.h | 134 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 135 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index e93935f..7865bc1 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -743,7 +743,7 @@ static const struct processor_costs *processor_cost_table[] =
   &btver2_cost,
   &znver1_cost,
   &znver2_cost,
-  &znver2_cost
+  &znver3_cost
 };
 
 /* Guarantee that the array is aligned with enum processor_type.  */
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index cc27c79..e655e66 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1688,6 +1688,140 @@ struct processor_costs znver2_cost = {
   "16",                               /* Func alignment.  */
 };
 
+struct processor_costs znver3_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2.  */
+
+  /* reg-reg moves are done by renaming and thus they are even cheaper than
+     1 cycle.  Because reg-reg move cost is 2 and following tables correspond
+     to doubles of latencies, we do not model this correctly.  It does not
+     seem to make a practical difference to bump prices up even more.  */
+  6,                                  /* cost for loading QImode using
+                                         movzbl.  */
+  {6, 6, 6},                          /* cost of loading integer registers
+                                         in QImode, HImode and SImode.
+                                         Relative to reg-reg move (2).  */
+  {8, 8, 8},                          /* cost of storing integer
+                                         registers.  */
+  2,                                  /* cost of reg,reg fld/fst.  */
+  {6, 6, 16},                         /* cost of loading fp registers
+                                         in SFmode, DFmode and XFmode.  */
+  {8, 8, 16},                         /* cost of storing fp registers
+                                         in SFmode, DFmode and XFmode.  */
+  2,                                  /* cost of moving MMX register.  */
+  {6, 6},                             /* cost of loading MMX registers
+                                         in SImode and DImode.  */
+  {8, 8},                             /* cost of storing MMX registers
+                                         in SImode and DImode.  */
+  2, 2, 3,                            /* cost of moving XMM,YMM,ZMM
+                                         register.  */
+  {6, 6, 6, 6, 12},                   /* cost of loading SSE registers
+                                         in 32,64,128,256 and 512-bit.  */
+  {8, 8, 8, 8, 16},                   /* cost of storing SSE registers
+                                         in 32,64,128,256 and 512-bit.  */
+  6, 6,                               /* SSE->integer and integer->SSE
+                                         moves.  */
+  8, 8,                               /* mask->integer and integer->mask moves */
+  {6, 6, 6},                          /* cost of loading mask register
+                                         in QImode, HImode, SImode.  */
+  {8, 8, 8},                          /* cost of storing mask register
+                                         in QImode, HImode, SImode.  */
+  2,                                  /* cost of moving mask register.  */
+  /* End of register allocator costs.  */
+  },
+
+  COSTS_N_INSNS (1),                  /* cost of an add instruction.  */
+  COSTS_N_INSNS (1),                  /* cost of a lea instruction.  */
+  COSTS_N_INSNS (1),                  /* variable shift costs.  */
+  COSTS_N_INSNS (1),                  /* constant shift costs.  */
+  {COSTS_N_INSNS (3),                 /* cost of starting multiply for QI.  */
+   COSTS_N_INSNS (3),                 /*                               HI.  */
+   COSTS_N_INSNS (3),                 /*                               SI.  */
+   COSTS_N_INSNS (3),                 /*                               DI.  */
+   COSTS_N_INSNS (3)},                /*                            other.  */
+  0,                                  /* cost of multiply per each bit
+                                         set.  */
+  /* Depending on parameters, idiv can get faster on ryzen.  This is the upper
+     bound.  */
+  {COSTS_N_INSNS (16),                /* cost of a divide/mod for QI.  */
+   COSTS_N_INSNS (22),                /*                          HI.  */
+   COSTS_N_INSNS (30),                /*                          SI.  */
+   COSTS_N_INSNS (45),                /*                          DI.  */
+   COSTS_N_INSNS (45)},               /*                       other.  */
+  COSTS_N_INSNS (1),                  /* cost of movsx.  */
+  COSTS_N_INSNS (1),                  /* cost of movzx.  */
+  8,                                  /* "large" insn.  */
+  9,                                  /* MOVE_RATIO.  */
+  6,                                  /* CLEAR_RATIO */
+  {6, 6, 6},                          /* cost of loading integer registers
+                                         in QImode, HImode and SImode.
+                                         Relative to reg-reg move (2).  */
+  {8, 8, 8},                          /* cost of storing integer
+                                         registers.  */
+  {6, 6, 6, 6, 12},                   /* cost of loading SSE registers
+                                         in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {8, 8, 8, 8, 16},                   /* cost of storing SSE register
+                                         in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 6, 12},                   /* cost of unaligned loads.  */
+  {8, 8, 8, 8, 16},                   /* cost of unaligned stores.  */
+  2, 2, 3,                            /* cost of moving XMM,YMM,ZMM
+                                         register.  */
+  6,                                  /* cost of moving SSE register to integer.  */
+  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
+     throughput 12.  Approx 9 uops do not depend on vector size and every load
+     is 7 uops.  */
+  18, 8,                              /* Gather load static, per_elt.  */
+  18, 10,                             /* Gather store static, per_elt.  */
+  32,                                 /* size of l1 cache.  */
+  512,                                /* size of l2 cache.  */
+  64,                                 /* size of prefetch block.  */
+  /* New AMD processors never drop prefetches; if they cannot be performed
+     immediately, they are queued.  We set number of simultaneous prefetches
+     to a large constant to reflect this (it probably is not a good idea not
+     to limit number of prefetches at all, as their execution also takes some
+     time).  */
+  100,                                /* number of parallel prefetches.  */
+  3,                                  /* Branch cost.  */
+  COSTS_N_INSNS (5),                  /* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (5),                  /* cost of FMUL instruction.  */
+  /* Latency of fdiv is 8-15.  */
+  COSTS_N_INSNS (15),                 /* cost of FDIV instruction.  */
+  COSTS_N_INSNS (1),                  /* cost of FABS instruction.  */
+  COSTS_N_INSNS (1),                  /* cost of FCHS instruction.  */
+  /* Latency of fsqrt is 4-10.  */
+  COSTS_N_INSNS (10),                 /* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (1),                  /* cost of cheap SSE instruction.  */
+  COSTS_N_INSNS (3),                  /* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (3),                  /* cost of MULSS instruction.  */
+  COSTS_N_INSNS (3),                  /* cost of MULSD instruction.  */
+  COSTS_N_INSNS (5),                  /* cost of FMA SS instruction.  */
+  COSTS_N_INSNS (5),                  /* cost of FMA SD instruction.  */
+  COSTS_N_INSNS (10),                 /* cost of DIVSS instruction.  */
+  /* 9-13.  */
+  COSTS_N_INSNS (13),                 /* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (10),                 /* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (15),                 /* cost of SQRTSD instruction.  */
+  /* Zen can execute 4 integer operations per cycle.  FP operations
+     take 3 cycles and it can execute 2 integer additions and 2
+     multiplications, thus reassociation may make sense up to width of 6.
+     SPEC2k6 benchmarks suggest that 4 works better than 6,
+     probably due to register pressure.
+
+     Integer vector operations are taken by FP unit and execute 3 vector
+     plus/minus operations per cycle but only one multiply.  This is adjusted
+     in ix86_reassociation_width.  */
+  4, 4, 3, 6,                         /* reassoc int, fp, vec_int, vec_fp.  */
+  znver2_memcpy,
+  znver2_memset,
+  COSTS_N_INSNS (4),                  /* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (2),                  /* cond_not_taken_branch_cost.  */
+  "16",                               /* Loop alignment.  */
+  "16",                               /* Jump alignment.  */
+  "0:0:8",                            /* Label alignment.  */
+  "16",                               /* Func alignment.  */
+};
+
 /* skylake_cost should produce code tuned for Skylake familly of CPUs.  */
 static stringop_algs skylake_memcpy[2] = {
   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
-- 
2.7.4
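
For readers looking at the new table outside the GCC tree: the values above are
consumed through COSTS_N_INSNS and through processor_cost_table, which is
indexed directly by enum processor_type (that is the alignment guarantee
mentioned in the first hunk).  The standalone C sketch below only illustrates
those two conventions; it assumes the conventional COSTS_N_INSNS definition
from gcc/rtl.h ((N) * 4), and every fake_* name is an invented stand-in for
illustration, not one of GCC's real declarations.

/* Minimal standalone sketch, not part of the patch.  COSTS_N_INSNS is
   assumed to mirror gcc/rtl.h ((N) * 4); all fake_* identifiers are
   illustrative stand-ins rather than GCC's real declarations.  */
#include <stdio.h>

#define COSTS_N_INSNS(N) ((N) * 4)   /* costs are kept in quarter-insn units */

/* Cut-down stand-ins for enum processor_type and processor_cost_table:
   the table is indexed by the enum, which is why adding a PROCESSOR_ZNVER3
   value needs a matching &znver3_cost slot in the same position.  */
enum fake_processor { FAKE_ZNVER1, FAKE_ZNVER2, FAKE_ZNVER3, FAKE_MAX };

struct fake_costs { int add; int divide_si; };

static const struct fake_costs fake_znver2 = { COSTS_N_INSNS (1), COSTS_N_INSNS (30) };
static const struct fake_costs fake_znver3 = { COSTS_N_INSNS (1), COSTS_N_INSNS (30) };

static const struct fake_costs *const fake_cost_table[] = {
  &fake_znver2,   /* FAKE_ZNVER1 (placeholder) */
  &fake_znver2,   /* FAKE_ZNVER2 */
  &fake_znver3,   /* FAKE_ZNVER3: the slot the patch fills in */
};

/* Same guarantee the patch preserves: the table length tracks the enum.  */
_Static_assert (sizeof (fake_cost_table) / sizeof (fake_cost_table[0]) == FAKE_MAX,
                "cost table must stay aligned with the processor enum");

int
main (void)
{
  const struct fake_costs *c = fake_cost_table[FAKE_ZNVER3];
  /* COSTS_N_INSNS (1) == 4 and COSTS_N_INSNS (30) == 120: an SImode divide
     is modelled as 30 times the cost of a plain add.  */
  printf ("add=%d divide_si=%d\n", c->add, c->divide_si);
  return 0;
}

Compiled with any C11 compiler this prints "add=4 divide_si=120", matching the
COSTS_N_INSNS (30) SImode divide entry in the table above.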