From 810b8fdff92ae8c234041234fdc1a175c3eb1ff9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 9 Nov 2022 17:08:36 +0000 Subject: [PATCH] [X86] Replace unnecessary CVTPD2PI/CVTPD2DQ overrides with better base class defs Broadwell/Haswell were completely overriding the WriteCvtPD2I class defs - we can remove those overrides entirely by just choosing better class defs. Also fixes the scheduler for a missing YMM folded case - confirmed with Agner + uops.info that the port usage is correct --- llvm/lib/Target/X86/X86SchedBroadwell.td | 27 +++---------------- llvm/lib/Target/X86/X86SchedHaswell.td | 31 ++++------------------ .../tools/llvm-mca/X86/Broadwell/resources-avx1.s | 10 +++---- .../tools/llvm-mca/X86/Haswell/resources-avx1.s | 10 +++---- 4 files changed, 19 insertions(+), 59 deletions(-) diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index c6bc775..7156c2e 100644 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -362,8 +362,8 @@ defm : BWWriteResPair; defm : BWWriteResPair; defm : X86WriteResPairUnsupported; defm : BWWriteResPair; -defm : BWWriteResPair; -defm : BWWriteResPair; +defm : BWWriteResPair; +defm : BWWriteResPair; defm : X86WriteResPairUnsupported; defm : BWWriteResPair; @@ -851,12 +851,10 @@ def BWWriteResGroup42 : SchedWriteRes<[BWPort1,BWPort5]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVT(T?)PD2PIrr", - "MMX_CVT(T?)PS2PIrr", +def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVT(T?)PS2PIrr", "(V?)CVTSI642SDrr", "(V?)CVTSI2SDrr", - "(V?)CVTSI2SSrr", - "(V?)CVT(T?)PD2DQrr")>; + "(V?)CVTSI2SSrr")>; def BWWriteResGroup43 : SchedWriteRes<[BWPort0,BWPort4,BWPort237]> { let Latency = 4; @@ -968,14 +966,6 @@ def: InstRW<[BWWriteResGroup59], (instrs CVTPS2PDrm, VCVTPS2PDrm, VPSLLVQrm, VPSRLVQrm)>; -def BWWriteResGroup60 : SchedWriteRes<[BWPort1,BWPort5]> { - let Latency = 6; - let NumMicroOps 
= 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[BWWriteResGroup60], (instrs VCVTPD2DQYrr, - VCVTTPD2DQYrr)>; - def BWWriteResGroup62 : SchedWriteRes<[BWPort6,BWPort23]> { let Latency = 6; let NumMicroOps = 2; @@ -1188,15 +1178,6 @@ def: InstRW<[BWWriteResGroup101], (instregex "(ADD|SUB|SUBR)_F(32|64)m", def: InstRW<[BWWriteResGroup101], (instrs VCVTPS2DQYrm, VCVTTPS2DQYrm)>; -def BWWriteResGroup107 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> { - let Latency = 9; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: InstRW<[BWWriteResGroup107], (instrs CVTPD2DQrm, VCVTPD2DQrm, - CVTTPD2DQrm, VCVTTPD2DQrm)>; -def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVT(T?)PD2PIrm")>; - def BWWriteResGroup108 : SchedWriteRes<[BWPort5,BWPort23,BWPort015]> { let Latency = 9; let NumMicroOps = 3; diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index 0cd007c..7c5804f 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -356,9 +356,9 @@ defm : HWWriteResPair; // Unsupported // Conversion between integer and float. 
defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; -defm : HWWriteResPair; // Unsupported = 1 +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; // Unsupported = 1 defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; @@ -1354,13 +1354,10 @@ def HWWriteResGroup73 : SchedWriteRes<[HWPort1,HWPort5]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup73], (instrs MMX_CVTPD2PIrr, - MMX_CVTPS2PIrr, - MMX_CVTTPD2PIrr, +def: InstRW<[HWWriteResGroup73], (instrs MMX_CVTPS2PIrr, MMX_CVTTPS2PIrr)>; def: InstRW<[HWWriteResGroup73], (instregex "(V?)CVTSI(64)?2SDrr", - "(V?)CVTSI2SSrr", - "(V?)CVT(T?)PD2DQrr")>; + "(V?)CVTSI2SSrr")>; def HWWriteResGroup75 : SchedWriteRes<[HWPort1,HWPort23]> { let Latency = 11; @@ -1369,16 +1366,6 @@ def HWWriteResGroup75 : SchedWriteRes<[HWPort1,HWPort23]> { } def: InstRW<[HWWriteResGroup75], (instregex "FICOM(P?)(16|32)m")>; -def HWWriteResGroup78 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { - let Latency = 10; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: InstRW<[HWWriteResGroup78], (instrs CVTPD2DQrm, VCVTPD2DQrm, - CVTTPD2DQrm, VCVTTPD2DQrm, - MMX_CVTPD2PIrm, - MMX_CVTTPD2PIrm)>; - def HWWriteResGroup78_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { let Latency = 9; let NumMicroOps = 3; @@ -1479,14 +1466,6 @@ def HWWriteResGroup100 : SchedWriteRes<[HWPort06,HWPort0156]> { } def: InstRW<[HWWriteResGroup100], (instrs XSETBV)>; -def HWWriteResGroup102 : SchedWriteRes<[HWPort1,HWPort5]> { - let Latency = 6; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[HWWriteResGroup102], (instrs VCVTPD2DQYrr, - VCVTTPD2DQYrr)>; - def HWWriteResGroup103 : SchedWriteRes<[HWPort1,HWPort23]> { let Latency = 13; let NumMicroOps = 3; diff --git a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s index 27c6120..c33cc79 100644 --- 
a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s +++ b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s @@ -1125,7 +1125,7 @@ vzeroupper # CHECK-NEXT: 2 4 1.00 vcvtpd2dq %xmm0, %xmm2 # CHECK-NEXT: 3 9 1.00 * vcvtpd2dqx (%rax), %xmm2 # CHECK-NEXT: 2 6 1.00 vcvtpd2dq %ymm0, %xmm2 -# CHECK-NEXT: 2 8 1.00 * vcvtpd2dqy (%rax), %xmm2 +# CHECK-NEXT: 3 12 1.00 * vcvtpd2dqy (%rax), %xmm2 # CHECK-NEXT: 2 4 1.00 vcvtpd2ps %xmm0, %xmm2 # CHECK-NEXT: 3 9 1.00 * vcvtpd2psx (%rax), %xmm2 # CHECK-NEXT: 2 6 1.00 vcvtpd2ps %ymm0, %xmm2 @@ -1161,7 +1161,7 @@ vzeroupper # CHECK-NEXT: 2 4 1.00 vcvttpd2dq %xmm0, %xmm2 # CHECK-NEXT: 3 9 1.00 * vcvttpd2dqx (%rax), %xmm2 # CHECK-NEXT: 2 6 1.00 vcvttpd2dq %ymm0, %xmm2 -# CHECK-NEXT: 2 8 1.00 * vcvttpd2dqy (%rax), %xmm2 +# CHECK-NEXT: 3 12 1.00 * vcvttpd2dqy (%rax), %xmm2 # CHECK-NEXT: 1 3 1.00 vcvttps2dq %xmm0, %xmm2 # CHECK-NEXT: 2 8 1.00 * vcvttps2dq (%rax), %xmm2 # CHECK-NEXT: 1 3 1.00 vcvttps2dq %ymm0, %ymm2 @@ -1736,7 +1736,7 @@ vzeroupper # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - 257.00 215.25 235.25 176.17 176.17 38.00 430.25 2.25 12.67 +# CHECK-NEXT: - 257.00 215.25 235.25 176.17 176.17 38.00 432.25 2.25 12.67 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -1835,7 +1835,7 @@ vzeroupper # CHECK-NEXT: - - - 1.00 - - - 1.00 - - vcvtpd2dq %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 0.50 0.50 - 1.00 - - vcvtpd2dqx (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - - - 1.00 - - vcvtpd2dq %ymm0, %xmm2 -# CHECK-NEXT: - - - 1.00 0.50 0.50 - - - - vcvtpd2dqy (%rax), %xmm2 +# CHECK-NEXT: - - - 1.00 0.50 0.50 - 1.00 - - vcvtpd2dqy (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - - - 1.00 - - vcvtpd2ps %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 0.50 0.50 - 1.00 - - vcvtpd2psx (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - - - 1.00 - - vcvtpd2ps %ymm0, %xmm2 @@ -1871,7 +1871,7 @@ vzeroupper # CHECK-NEXT: - - - 1.00 - - - 1.00 - - 
vcvttpd2dq %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 0.50 0.50 - 1.00 - - vcvttpd2dqx (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - - - 1.00 - - vcvttpd2dq %ymm0, %xmm2 -# CHECK-NEXT: - - - 1.00 0.50 0.50 - - - - vcvttpd2dqy (%rax), %xmm2 +# CHECK-NEXT: - - - 1.00 0.50 0.50 - 1.00 - - vcvttpd2dqy (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - - - - - - vcvttps2dq %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 0.50 0.50 - - - - vcvttps2dq (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - - - - - - vcvttps2dq %ymm0, %ymm2 diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s index ea7d251..3da547d 100644 --- a/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s +++ b/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s @@ -1125,7 +1125,7 @@ vzeroupper # CHECK-NEXT: 2 4 1.00 vcvtpd2dq %xmm0, %xmm2 # CHECK-NEXT: 3 10 1.00 * vcvtpd2dqx (%rax), %xmm2 # CHECK-NEXT: 2 6 1.00 vcvtpd2dq %ymm0, %xmm2 -# CHECK-NEXT: 2 8 1.00 * vcvtpd2dqy (%rax), %xmm2 +# CHECK-NEXT: 3 12 1.00 * vcvtpd2dqy (%rax), %xmm2 # CHECK-NEXT: 2 4 1.00 vcvtpd2ps %xmm0, %xmm2 # CHECK-NEXT: 3 10 1.00 * vcvtpd2psx (%rax), %xmm2 # CHECK-NEXT: 2 6 1.00 vcvtpd2ps %ymm0, %xmm2 @@ -1161,7 +1161,7 @@ vzeroupper # CHECK-NEXT: 2 4 1.00 vcvttpd2dq %xmm0, %xmm2 # CHECK-NEXT: 3 10 1.00 * vcvttpd2dqx (%rax), %xmm2 # CHECK-NEXT: 2 6 1.00 vcvttpd2dq %ymm0, %xmm2 -# CHECK-NEXT: 2 8 1.00 * vcvttpd2dqy (%rax), %xmm2 +# CHECK-NEXT: 3 12 1.00 * vcvttpd2dqy (%rax), %xmm2 # CHECK-NEXT: 1 3 1.00 vcvttps2dq %xmm0, %xmm2 # CHECK-NEXT: 2 9 1.00 * vcvttps2dq (%rax), %xmm2 # CHECK-NEXT: 1 3 1.00 vcvttps2dq %ymm0, %ymm2 @@ -1736,7 +1736,7 @@ vzeroupper # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - 336.00 214.58 236.58 176.17 176.17 38.00 433.58 2.25 12.67 +# CHECK-NEXT: - 336.00 214.58 236.58 176.17 176.17 38.00 435.58 2.25 12.67 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 
Instructions: @@ -1835,7 +1835,7 @@ vzeroupper # CHECK-NEXT: - - - 1.00 - - - 1.00 - - vcvtpd2dq %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 0.50 0.50 - 1.00 - - vcvtpd2dqx (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - - - 1.00 - - vcvtpd2dq %ymm0, %xmm2 -# CHECK-NEXT: - - - 1.00 0.50 0.50 - - - - vcvtpd2dqy (%rax), %xmm2 +# CHECK-NEXT: - - - 1.00 0.50 0.50 - 1.00 - - vcvtpd2dqy (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - - - 1.00 - - vcvtpd2ps %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 0.50 0.50 - 1.00 - - vcvtpd2psx (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - - - 1.00 - - vcvtpd2ps %ymm0, %xmm2 @@ -1871,7 +1871,7 @@ vzeroupper # CHECK-NEXT: - - - 1.00 - - - 1.00 - - vcvttpd2dq %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 0.50 0.50 - 1.00 - - vcvttpd2dqx (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - - - 1.00 - - vcvttpd2dq %ymm0, %xmm2 -# CHECK-NEXT: - - - 1.00 0.50 0.50 - - - - vcvttpd2dqy (%rax), %xmm2 +# CHECK-NEXT: - - - 1.00 0.50 0.50 - 1.00 - - vcvttpd2dqy (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - - - - - - vcvttps2dq %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 0.50 0.50 - - - - vcvttps2dq (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - - - - - - vcvttps2dq %ymm0, %ymm2 -- 2.7.4