From 0c9c92ffc0acde32376b7ba8dcbe7fec90de3ff3 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 3 Mar 2022 12:07:48 +0000 Subject: [PATCH] [X86][XOP] Tidyup VPHADD/VPHSUB unary horizontal ops default schedule class Based off Agner and AMD SoG tables, the XOP VPHADD/VPHSUB unary horizontal ops are as fast as basic arithmetic ops, not the slower SSSE3 binary horizontal add/sub ops. This also matches what the bdver2 model already lists. Noticed while investigating reduction add optimizations. --- llvm/lib/Target/X86/X86InstrXOP.td | 4 +- .../tools/llvm-mca/X86/Generic/resources-xop.s | 122 ++++++++++----------- 2 files changed, 63 insertions(+), 63 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrXOP.td b/llvm/lib/Target/X86/X86InstrXOP.td index a5976b7..d89e481 100644 --- a/llvm/lib/Target/X86/X86InstrXOP.td +++ b/llvm/lib/Target/X86/X86InstrXOP.td @@ -13,11 +13,11 @@ multiclass xop2op opc, string OpcodeStr, Intrinsic Int> { def rr : IXOP, XOP, Sched<[SchedWritePHAdd.XMM]>; + [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWriteVecALU.XMM]>; def rm : IXOP, XOP, - Sched<[SchedWritePHAdd.XMM.Folded, SchedWritePHAdd.XMM.ReadAfterFold]>; + Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>; } let ExeDomain = SSEPackedInt in { diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-xop.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-xop.s index 8b79a95..cc1fd815 100644 --- a/llvm/test/tools/llvm-mca/X86/Generic/resources-xop.s +++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-xop.s @@ -267,36 +267,36 @@ vpshlw %xmm0, (%rax), %xmm3 # CHECK-NEXT: 1 1 1.00 vpermil2ps $0, %ymm0, %ymm1, %ymm2, %ymm3 # CHECK-NEXT: 2 8 1.00 * vpermil2ps $0, (%rax), %ymm0, %ymm1, %ymm3 # CHECK-NEXT: 2 8 1.00 * vpermil2ps $0, %ymm0, (%rax), %ymm1, %ymm3 -# CHECK-NEXT: 3 3 1.50 vphaddbd %xmm0, %xmm3 -# CHECK-NEXT: 4 9 1.50 * vphaddbd (%rax), %xmm3 -# CHECK-NEXT: 3 3 1.50 vphaddbq %xmm0, %xmm3 -# CHECK-NEXT: 4 9 1.50 * vphaddbq (%rax), %xmm3 -# CHECK-NEXT: 3 3 1.50 vphaddbw %xmm0, %xmm3 -# CHECK-NEXT: 4 9 1.50 * vphaddbw (%rax), %xmm3 -# CHECK-NEXT: 3 3 1.50 vphadddq %xmm0, %xmm3 -# CHECK-NEXT: 4 9 1.50 * vphadddq (%rax), %xmm3 -# CHECK-NEXT: 3 3 1.50 vphaddubd %xmm0, %xmm3 -# CHECK-NEXT: 4 9 1.50 * vphaddubd (%rax), %xmm3 -# CHECK-NEXT: 3 3 1.50 vphaddubq %xmm0, %xmm3 -# CHECK-NEXT: 4 9 1.50 * vphaddubq (%rax), %xmm3 -# CHECK-NEXT: 3 3 1.50 vphaddubw %xmm0, %xmm3 -# CHECK-NEXT: 4 9 1.50 * vphaddubw (%rax), %xmm3 -# CHECK-NEXT: 3 3 1.50 vphaddudq %xmm0, %xmm3 -# CHECK-NEXT: 4 9 1.50 * vphaddudq (%rax), %xmm3 -# CHECK-NEXT: 3 3 1.50 vphadduwd %xmm0, %xmm3 -# CHECK-NEXT: 4 9 1.50 * vphadduwd (%rax), %xmm3 -# CHECK-NEXT: 3 3 1.50 vphadduwq %xmm0, %xmm3 -# CHECK-NEXT: 4 9 1.50 * vphadduwq (%rax), %xmm3 -# CHECK-NEXT: 3 3 1.50 vphaddwd %xmm0, %xmm3 -# CHECK-NEXT: 4 9 1.50 * vphaddwd (%rax), %xmm3 -# CHECK-NEXT: 3 3 1.50 vphaddwq %xmm0, %xmm3 -# CHECK-NEXT: 4 9 1.50 * vphaddwq (%rax), %xmm3 -# CHECK-NEXT: 3 3 1.50 vphsubbw %xmm0, %xmm3 -# CHECK-NEXT: 4 9 1.50 * vphsubbw (%rax), %xmm3 -# CHECK-NEXT: 3 3 1.50 vphsubdq %xmm0, %xmm3 -# CHECK-NEXT: 4 9 1.50 * vphsubdq (%rax), %xmm3 -# CHECK-NEXT: 3 3 1.50 vphsubwd %xmm0, %xmm3 -# CHECK-NEXT: 4 9 1.50 * vphsubwd (%rax), %xmm3 +# CHECK-NEXT: 1 1 0.50 vphaddbd %xmm0, %xmm3 +# CHECK-NEXT: 2 7 0.50 * vphaddbd (%rax), %xmm3 +# CHECK-NEXT: 1 1 0.50 vphaddbq %xmm0, %xmm3 +# CHECK-NEXT: 2 7 0.50 * vphaddbq (%rax), %xmm3 +# CHECK-NEXT: 1 1 0.50 vphaddbw %xmm0, %xmm3 +# CHECK-NEXT: 2 7 0.50 * vphaddbw (%rax), %xmm3 +# CHECK-NEXT: 1 1 0.50 vphadddq %xmm0, %xmm3 +# CHECK-NEXT: 2 7 0.50 * vphadddq (%rax), %xmm3 +# CHECK-NEXT: 1 1 0.50 vphaddubd %xmm0, %xmm3 +# CHECK-NEXT: 2 7 0.50 * vphaddubd (%rax), %xmm3 +# CHECK-NEXT: 1 1 0.50 vphaddubq %xmm0, %xmm3 +# CHECK-NEXT: 2 7 0.50 * vphaddubq (%rax), %xmm3 +# CHECK-NEXT: 1 1 0.50 vphaddubw %xmm0, %xmm3 +# CHECK-NEXT: 2 7 0.50 * vphaddubw (%rax), %xmm3 +# CHECK-NEXT: 1 1 0.50 vphaddudq %xmm0, %xmm3 +# CHECK-NEXT: 2 7 0.50 * vphaddudq (%rax), %xmm3 +# CHECK-NEXT: 1 1 0.50 vphadduwd %xmm0, %xmm3 +# CHECK-NEXT: 2 7 0.50 * vphadduwd (%rax), %xmm3 +# CHECK-NEXT: 1 1 0.50 vphadduwq %xmm0, %xmm3 +# CHECK-NEXT: 2 7 0.50 * vphadduwq (%rax), %xmm3 +# CHECK-NEXT: 1 1 0.50 vphaddwd %xmm0, %xmm3 +# CHECK-NEXT: 2 7 0.50 * vphaddwd (%rax), %xmm3 +# CHECK-NEXT: 1 1 0.50 vphaddwq %xmm0, %xmm3 +# CHECK-NEXT: 2 7 0.50 * vphaddwq (%rax), %xmm3 +# CHECK-NEXT: 1 1 0.50 vphsubbw %xmm0, %xmm3 +# CHECK-NEXT: 2 7 0.50 * vphsubbw (%rax), %xmm3 +# CHECK-NEXT: 1 1 0.50 vphsubdq %xmm0, %xmm3 +# CHECK-NEXT: 2 7 0.50 * vphsubdq (%rax), %xmm3 +# CHECK-NEXT: 1 1 0.50 vphsubwd %xmm0, %xmm3 +# CHECK-NEXT: 2 7 0.50 * vphsubwd (%rax), %xmm3 # CHECK-NEXT: 1 5 1.00 vpmacsdd %xmm0, %xmm1, %xmm2, %xmm3 # CHECK-NEXT: 2 11 1.00 * vpmacsdd %xmm0, (%rax), %xmm1, %xmm3 # CHECK-NEXT: 1 5 1.00 vpmacsdqh %xmm0, %xmm1, %xmm2, %xmm3 @@ -381,7 +381,7 @@ vpshlw %xmm0, (%rax), %xmm3 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] -# CHECK-NEXT: - - 68.00 68.00 - 71.00 41.50 41.50 +# CHECK-NEXT: - - 68.00 38.00 - 41.00 41.50 41.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: @@ -431,36 +431,36 @@ vpshlw %xmm0, (%rax), %xmm3 # CHECK-NEXT: - - - - - 1.00 - - vpermil2ps $0, %ymm0, %ymm1, %ymm2, %ymm3 # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vpermil2ps $0, (%rax), %ymm0, %ymm1, %ymm3 # CHECK-NEXT: - - - - - 1.00 0.50 0.50 vpermil2ps $0, %ymm0, (%rax), %ymm1, %ymm3 -# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddbd %xmm0, %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddbd (%rax), %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddbq %xmm0, %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddbq (%rax), %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddbw %xmm0, %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddbw (%rax), %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 - - vphadddq %xmm0, %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphadddq (%rax), %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddubd %xmm0, %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddubd (%rax), %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddubq %xmm0, %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddubq (%rax), %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddubw %xmm0, %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddubw (%rax), %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddudq %xmm0, %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddudq (%rax), %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 - - vphadduwd %xmm0, %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphadduwd (%rax), %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 - - vphadduwq %xmm0, %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphadduwq (%rax), %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddwd %xmm0, %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddwd (%rax), %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 - - vphaddwq %xmm0, %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphaddwq (%rax), %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 - - vphsubbw %xmm0, %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphsubbw (%rax), %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 - - vphsubdq %xmm0, %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphsubdq (%rax), %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 - - vphsubwd %xmm0, %xmm3 -# CHECK-NEXT: - - - 1.50 - 1.50 0.50 0.50 vphsubwd (%rax), %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddbd %xmm0, %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddbd (%rax), %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddbq %xmm0, %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddbq (%rax), %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddbw %xmm0, %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddbw (%rax), %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 - - vphadddq %xmm0, %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphadddq (%rax), %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddubd %xmm0, %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddubd (%rax), %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddubq %xmm0, %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddubq (%rax), %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddubw %xmm0, %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddubw (%rax), %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddudq %xmm0, %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddudq (%rax), %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 - - vphadduwd %xmm0, %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphadduwd (%rax), %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 - - vphadduwq %xmm0, %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphadduwq (%rax), %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddwd %xmm0, %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddwd (%rax), %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 - - vphaddwq %xmm0, %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphaddwq (%rax), %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 - - vphsubbw %xmm0, %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphsubbw (%rax), %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 - - vphsubdq %xmm0, %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphsubdq (%rax), %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 - - vphsubwd %xmm0, %xmm3 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 vphsubwd (%rax), %xmm3 # CHECK-NEXT: - - 1.00 - - - - - vpmacsdd %xmm0, %xmm1, %xmm2, %xmm3 # CHECK-NEXT: - - 1.00 - - - 0.50 0.50 vpmacsdd %xmm0, (%rax), %xmm1, %xmm3 # CHECK-NEXT: - - 1.00 - - - - - vpmacsdqh %xmm0, %xmm1, %xmm2, %xmm3 -- 2.7.4