i386: correct x87&SSE division modeling in znver.md
author Alexander Monakov <amonakov@ispras.ru>
Tue, 1 Nov 2022 14:04:25 +0000 (17:04 +0300)
committer Alexander Monakov <amonakov@ispras.ru>
Wed, 16 Nov 2022 13:41:39 +0000 (16:41 +0300)
Correct modeling of division instructions in the SIMD/FP domain for
AMD Zen architectures and avoid combinatorial explosion of automaton
tables by modeling the separate floating-point division unit and
correcting reservations to reflect reciprocal throughput of the
corresponding instructions, similar to earlier commit
5cee5f94000 ("i386: correct integer division modeling in znver.md").

Division is partially pipelined and some instructions have fractional
reciprocal throughput (e.g. Zen 3 can issue divss and divsd every 3.5
and 4.5 cycles on average, respectively). Since these CPUs implement
out-of-order execution, the model doesn't need to be exact to the last
cycle, so simplify it by using 4/5 cycles for SF/DF modes, and by not
modeling the fact that the FP3 pipe is occupied for one cycle.

Top znver table sizes in insn-automata.o:

Before:

428108 r znver1_fp_min_issue_delay
856216 r znver1_fp_transitions

After:

30056 r znver1_fp_min_issue_delay
120224 r znver1_fp_transitions

gcc/ChangeLog:

PR target/87832
* config/i386/znver.md (znver1_fdiv): New automaton.
(znver1-fdiv): New unit.
(znver1_fp_op_div): Correct unit and cycles in the reservation.
(znver1_fp_op_div_load): Ditto.
(znver1_fp_op_idiv_load): Ditto.
(znver2_fp_op_idiv_load): Ditto.
(znver1_ssediv_ss_ps): Ditto.
(znver1_ssediv_ss_ps_load): Ditto.
(znver1_ssediv_sd_pd): Ditto.
(znver1_ssediv_sd_pd_load): Ditto.
(znver1_ssediv_avx256_ps): Ditto.
(znver1_ssediv_avx256_ps_load): Ditto.
(znver1_ssediv_avx256_pd): Ditto.
(znver1_ssediv_avx256_pd_load): Ditto.

gcc/config/i386/znver.md

index 4aa098f..c52f8b5 100644 (file)
@@ -24,7 +24,7 @@
 ;; AMD znver1, znver2 and znver3 Scheduling
 ;; Modeling automatons for zen decoders, integer execution pipes,
 ;; SIMD/FP domain, AGU pipes, and dividers.
-(define_automaton "znver1, znver1_ieu, znver1_fp, znver1_agu, znver1_idiv")
+(define_automaton "znver1, znver1_ieu, znver1_fp, znver1_agu, znver1_idiv, znver1_fdiv")
 
 ;; Decoders unit has 4 decoders and all of them can decode fast path
 ;; and vector type instructions.
@@ -95,6 +95,7 @@
 
 ;; Dividers
 (define_cpu_unit "znver1-idiv" "znver1_idiv")
+(define_cpu_unit "znver1-fdiv" "znver1_fdiv")
 
 ;; Call instruction
 (define_insn_reservation "znver1_call" 1
                         (and (eq_attr "cpu" "znver1,znver2,znver3")
                              (and (eq_attr "type" "fdiv")
                                   (eq_attr "memory" "none")))
-                        "znver1-direct,znver1-fp3*15")
+                        "znver1-direct,znver1-fdiv*6")
 
 (define_insn_reservation "znver1_fp_op_div_load" 22
                         (and (eq_attr "cpu" "znver1,znver2,znver3")
                              (and (eq_attr "type" "fdiv")
                                   (eq_attr "memory" "load")))
-                        "znver1-direct,znver1-load,znver1-fp3*15")
+                        "znver1-direct,znver1-load,znver1-fdiv*6")
 
 (define_insn_reservation "znver1_fp_op_idiv_load" 27
                         (and (eq_attr "cpu" "znver1")
                              (and (eq_attr "type" "fdiv")
                                   (and (eq_attr "fp_int_src" "true")
                                        (eq_attr "memory" "load"))))
-                        "znver1-double,znver1-load,znver1-fp3*19")
+                        "znver1-double,znver1-load,znver1-fdiv*6")
 
 (define_insn_reservation "znver2_fp_op_idiv_load" 26
                         (and (eq_attr "cpu" "znver2,znver3")
                              (and (eq_attr "type" "fdiv")
                                   (and (eq_attr "fp_int_src" "true")
                                        (eq_attr "memory" "load"))))
-                        "znver1-double,znver1-load,znver1-fp3*19")
+                        "znver1-double,znver1-load,znver1-fdiv*6")
 
 
 ;; MMX, SSE, SSEn.n, AVX, AVX2 instructions
                                              (eq_attr "mode" "V8SF,V4SF,SF")))
                              (and (eq_attr "type" "ssediv")
                                   (eq_attr "memory" "none")))
-                        "znver1-direct,znver1-fp3*10")
+                        "znver1-direct,znver1-fdiv*4")
 
 (define_insn_reservation "znver1_ssediv_ss_ps_load" 17
                         (and (ior (and (eq_attr "cpu" "znver1")
                                              (eq_attr "mode" "V8SF,V4SF,SF")))
                              (and (eq_attr "type" "ssediv")
                                   (eq_attr "memory" "load")))
-                        "znver1-direct,znver1-load,znver1-fp3*10")
+                        "znver1-direct,znver1-load,znver1-fdiv*4")
 
 (define_insn_reservation "znver1_ssediv_sd_pd" 13
                         (and (ior (and (eq_attr "cpu" "znver1")
                                              (eq_attr "mode" "V4DF,V2DF,DF")))
                              (and (eq_attr "type" "ssediv")
                                   (eq_attr "memory" "none")))
-                        "znver1-direct,znver1-fp3*13")
+                        "znver1-direct,znver1-fdiv*5")
 
 (define_insn_reservation "znver1_ssediv_sd_pd_load" 20
                         (and (ior (and (eq_attr "cpu" "znver1")
                                              (eq_attr "mode" "V4DF,V2DF,DF")))
                              (and (eq_attr "type" "ssediv")
                                   (eq_attr "memory" "load")))
-                        "znver1-direct,znver1-load,znver1-fp3*13")
+                        "znver1-direct,znver1-load,znver1-fdiv*5")
 
 (define_insn_reservation "znver1_ssediv_avx256_ps" 12
                         (and (eq_attr "cpu" "znver1")
                              (and (eq_attr "mode" "V8SF")
                                   (and (eq_attr "memory" "none")
                                        (eq_attr "type" "ssediv"))))
-                        "znver1-double,znver1-fp3*12")
+                        "znver1-double,znver1-fdiv*8")
 
 (define_insn_reservation "znver1_ssediv_avx256_ps_load" 19
                         (and (eq_attr "cpu" "znver1")
                              (and (eq_attr "mode" "V8SF")
                                   (and (eq_attr "type" "ssediv")
                                        (eq_attr "memory" "load"))))
-                        "znver1-double,znver1-load,znver1-fp3*12")
+                        "znver1-double,znver1-load,znver1-fdiv*8")
 
 (define_insn_reservation "znver1_ssediv_avx256_pd" 15
                         (and (eq_attr "cpu" "znver1")
                              (and (eq_attr "mode" "V4DF")
                                   (and (eq_attr "type" "ssediv")
                                        (eq_attr "memory" "none"))))
-                        "znver1-double,znver1-fp3*15")
+                        "znver1-double,znver1-fdiv*10")
 
 (define_insn_reservation "znver1_ssediv_avx256_pd_load" 22 
                         (and (eq_attr "cpu" "znver1")
                              (and (eq_attr "mode" "V4DF")
                                   (and (eq_attr "type" "ssediv")
                                        (eq_attr "memory" "load"))))
-                        "znver1-double,znver1-load,znver1-fp3*15")
+                        "znver1-double,znver1-load,znver1-fdiv*10")
 ;; SSE MUL
 (define_insn_reservation "znver1_ssemul_ss_ps" 3
                         (and (ior (and (eq_attr "cpu" "znver1")