From eea6a2782e852ee38a56af8245a27d864b56b592 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 29 Oct 2022 12:03:38 +0100 Subject: [PATCH] [X86] WriteFShuffle256 shuffles aren't microcoded in the llvm sense. znver1/2 might have poor throughput for crosslane shuffles, but they don't consume 100 cycles of resources. I think there was a misunderstanding between the AMD definition of microcoding (more than 2-3 uops) and LLVM's (here be dragons - impossible to approximately model the instruction). This is more yak shaving to come from D103695 - this time working out why codegen involving broadcasts gives such weird numbers. --- llvm/lib/Target/X86/X86ScheduleZnver1.td | 7 ++----- llvm/lib/Target/X86/X86ScheduleZnver2.td | 7 ++----- llvm/test/tools/llvm-mca/X86/Znver1/resources-avx2.s | 12 ++++++------ llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s | 12 ++++++------ 4 files changed, 16 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td index 79beb3e..01deab3 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -372,6 +372,8 @@ defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : X86WriteResPairUnsupported; defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; // Vector integer operations which uses FPU units defm : X86WriteRes; @@ -479,11 +481,6 @@ defm : ZnWriteResFpuPair; def : WriteRes; def : WriteRes; -// Following instructions with latency=100 are microcoded. -// We set long latency so as to block the entire pipeline. 
-defm : ZnWriteResFpuPair; -defm : ZnWriteResFpuPair; - // Microcoded Instructions def ZnWriteMicrocoded : SchedWriteRes<[]> { let Latency = 100; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td index b4f72a9..788b71e 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver2.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td @@ -371,6 +371,8 @@ defm : Zn2WriteResFpuPair; defm : Zn2WriteResFpuPair; defm : X86WriteResPairUnsupported; defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; +defm : Zn2WriteResFpuPair; // Vector integer operations which uses FPU units defm : X86WriteRes; @@ -478,11 +480,6 @@ defm : Zn2WriteResFpuPair; def : WriteRes; def : WriteRes; -// Following instructions with latency=100 are microcoded. -// We set long latency so as to block the entire pipeline. -defm : Zn2WriteResFpuPair; -defm : Zn2WriteResFpuPair; - // Microcoded Instructions def Zn2WriteMicrocoded : SchedWriteRes<[]> { let Latency = 100; diff --git a/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx2.s index 0648c31..179e6a4 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx2.s +++ b/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx2.s @@ -461,8 +461,8 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK: [1] [2] [3] [4] [5] [6] Instructions: # CHECK-NEXT: 1 8 0.50 * vbroadcasti128 (%rax), %ymm0 -# CHECK-NEXT: 1 100 0.25 vbroadcastsd %xmm0, %ymm0 -# CHECK-NEXT: 1 100 0.25 vbroadcastss %xmm0, %ymm0 +# CHECK-NEXT: 1 2 0.25 vbroadcastsd %xmm0, %ymm0 +# CHECK-NEXT: 1 2 0.25 vbroadcastss %xmm0, %ymm0 # CHECK-NEXT: 1 2 0.25 vextracti128 $1, %ymm0, %xmm2 # CHECK-NEXT: 1 1 0.50 * vextracti128 $1, %ymm0, (%rax) # CHECK-NEXT: 1 100 0.25 * vgatherdpd %xmm0, (%rax,%xmm1,2), %xmm2 @@ -562,10 +562,10 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 9 0.50 * vperm2i128 $1, (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 2 0.25 vpermd %ymm0, %ymm1, %ymm2 # CHECK-NEXT: 1 9 0.50 * vpermd (%rax), %ymm1, %ymm2 -# 
CHECK-NEXT: 1 100 0.25 vpermpd $1, %ymm0, %ymm2 -# CHECK-NEXT: 1 107 0.50 * vpermpd $1, (%rax), %ymm2 -# CHECK-NEXT: 1 100 0.25 vpermps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 1 107 0.50 * vpermps (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 1 2 0.25 vpermpd $1, %ymm0, %ymm2 +# CHECK-NEXT: 1 9 0.50 * vpermpd $1, (%rax), %ymm2 +# CHECK-NEXT: 1 2 0.25 vpermps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 1 9 0.50 * vpermps (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 2 0.25 vpermq $1, %ymm0, %ymm2 # CHECK-NEXT: 1 9 0.50 * vpermq $1, (%rax), %ymm2 # CHECK-NEXT: 1 100 0.25 * vpgatherdd %xmm0, (%rax,%xmm1,2), %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s index 93eb00e..e77f2f7 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s +++ b/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s @@ -461,8 +461,8 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK: [1] [2] [3] [4] [5] [6] Instructions: # CHECK-NEXT: 1 8 0.33 * vbroadcasti128 (%rax), %ymm0 -# CHECK-NEXT: 1 100 0.25 vbroadcastsd %xmm0, %ymm0 -# CHECK-NEXT: 1 100 0.25 vbroadcastss %xmm0, %ymm0 +# CHECK-NEXT: 1 2 0.25 vbroadcastsd %xmm0, %ymm0 +# CHECK-NEXT: 1 2 0.25 vbroadcastss %xmm0, %ymm0 # CHECK-NEXT: 1 2 0.25 vextracti128 $1, %ymm0, %xmm2 # CHECK-NEXT: 1 1 0.33 * vextracti128 $1, %ymm0, (%rax) # CHECK-NEXT: 1 100 0.25 * vgatherdpd %xmm0, (%rax,%xmm1,2), %xmm2 @@ -562,10 +562,10 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 9 0.33 * vperm2i128 $1, (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 2 0.25 vpermd %ymm0, %ymm1, %ymm2 # CHECK-NEXT: 1 9 0.33 * vpermd (%rax), %ymm1, %ymm2 -# CHECK-NEXT: 1 100 0.25 vpermpd $1, %ymm0, %ymm2 -# CHECK-NEXT: 1 107 0.33 * vpermpd $1, (%rax), %ymm2 -# CHECK-NEXT: 1 100 0.25 vpermps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 1 107 0.33 * vpermps (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 1 2 0.25 vpermpd $1, %ymm0, %ymm2 +# CHECK-NEXT: 1 9 0.33 * vpermpd $1, (%rax), %ymm2 +# CHECK-NEXT: 1 2 0.25 vpermps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 1 9 0.33 * vpermps 
(%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 2 0.25 vpermq $1, %ymm0, %ymm2 # CHECK-NEXT: 1 9 0.33 * vpermq $1, (%rax), %ymm2 # CHECK-NEXT: 1 100 0.25 * vpgatherdd %xmm0, (%rax,%xmm1,2), %xmm2 -- 2.7.4