From 389480999746fe26858b709df9ec7ee3837d82b8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 15 Mar 2018 23:00:47 +0000 Subject: [PATCH] [X86][Btver2] Fix ymm div/sqrt to use fmul unit YMM FDiv/FSqrt are dispatched on pipe JFPU1 but should be performed on the JFPM unit - that is where most of the cycles are spent. This matches the pipes for WriteFSqrt/WriteFDiv definitions. llvm-svn: 327682 --- llvm/lib/Target/X86/X86ScheduleBtVer2.td | 24 +++++------ llvm/test/tools/llvm-mca/X86/BtVer2/pipes-fpu.s | 53 ++++++++++++------------- 2 files changed, 38 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index bc777c2..1789700 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -488,15 +488,15 @@ def : InstRW<[JWriteFAddYLd, ReadAfterLd], (instrs VADDPDYrm, VADDPSYrm, VSUBPDYrm, VSUBPSYrm, VADDSUBPDYrm, VADDSUBPSYrm)>; -def JWriteFDivY: SchedWriteRes<[JFPU1]> { +def JWriteFDivY: SchedWriteRes<[JFPU1, JFPM]> { let Latency = 38; - let ResourceCycles = [38]; + let ResourceCycles = [1, 38]; } def : InstRW<[JWriteFDivY], (instrs VDIVPDYrr, VDIVPSYrr)>; -def JWriteFDivYLd: SchedWriteRes<[JLAGU, JFPU1]> { +def JWriteFDivYLd: SchedWriteRes<[JLAGU, JFPU1, JFPM]> { let Latency = 43; - let ResourceCycles = [1, 38]; + let ResourceCycles = [1, 1, 38]; } def : InstRW<[JWriteFDivYLd, ReadAfterLd], (instrs VDIVPDYrm, VDIVPSYrm)>; @@ -752,27 +752,27 @@ def JWriteVTESTLd: SchedWriteRes<[JLAGU, JFPU0]> { } def : InstRW<[JWriteVTESTLd], (instrs PTESTrm, VPTESTrm, VTESTPDrm, VTESTPSrm)>; -def JWriteVSQRTYPD: SchedWriteRes<[JFPU1]> { +def JWriteVSQRTYPD: SchedWriteRes<[JFPU1, JFPM]> { let Latency = 54; - let ResourceCycles = [54]; + let ResourceCycles = [1, 54]; } def : InstRW<[JWriteVSQRTYPD], (instrs VSQRTPDYr)>; -def JWriteVSQRTYPDLd: SchedWriteRes<[JLAGU, JFPU1]> { +def JWriteVSQRTYPDLd: SchedWriteRes<[JLAGU, JFPU1, JFPM]> { let Latency = 59; - let ResourceCycles = [1, 54]; + let ResourceCycles = [1, 1, 54]; } def : InstRW<[JWriteVSQRTYPDLd], (instrs VSQRTPDYm)>; -def JWriteVSQRTYPS: SchedWriteRes<[JFPU1]> { +def JWriteVSQRTYPS: SchedWriteRes<[JFPU1, JFPM]> { let Latency = 42; - let ResourceCycles = [42]; + let ResourceCycles = [1, 42]; } def : InstRW<[JWriteVSQRTYPS], (instrs VSQRTPSYr)>; -def JWriteVSQRTYPSLd: SchedWriteRes<[JLAGU, JFPU1]> { +def JWriteVSQRTYPSLd: SchedWriteRes<[JLAGU, JFPU1, JFPM]> { let Latency = 47; - let ResourceCycles = [1, 42]; + let ResourceCycles = [1, 1, 42]; } def : InstRW<[JWriteVSQRTYPSLd], (instrs VSQRTPSYm)>; diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/pipes-fpu.s b/llvm/test/tools/llvm-mca/X86/BtVer2/pipes-fpu.s index 0d7b5dc..4fc081a 100644 --- a/llvm/test/tools/llvm-mca/X86/BtVer2/pipes-fpu.s +++ b/llvm/test/tools/llvm-mca/X86/BtVer2/pipes-fpu.s @@ -19,9 +19,9 @@ vsqrtps %ymm0, %ymm1 # CHECK: Iterations: 70 # CHECK-NEXT: Instructions: 560 -# CHECK-NEXT: Total Cycles: 3155 +# CHECK-NEXT: Total Cycles: 4484 # CHECK-NEXT: Dispatch Width: 2 -# CHECK-NEXT: IPC: 0.18 +# CHECK-NEXT: IPC: 0.12 # CHECK: Instruction Info: @@ -61,8 +61,8 @@ vsqrtps %ymm0, %ymm1 # CHECK: Resource pressure per iteration: -# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: - - - - 21.00 5.00 45.00 - - - - - - 1.00 +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - - 63.00 5.00 4.00 - - - - - - 1.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: @@ -73,28 +73,27 @@ vsqrtps %ymm0, %ymm1 # CHECK-NEXT: - - - - - 1.00 - - - - - - - - vaddps %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - - 21.00 - 1.00 - - - - - - - vsqrtps %xmm0, %xmm1 # CHECK-NEXT: - - - - - 2.00 - - - - - - - - vaddps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: - - - - - - 42.00 - - - - - - - vsqrtps %ymm0, %ymm1 +# CHECK-NEXT: - - - - 42.00 - 1.00 - - - - - - - vsqrtps %ymm0, %ymm1 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 0123456789 0 +# CHECK-NEXT: 0123456789 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 -# CHECK: [0,0] DeeER. . . . . . . . . . . . . . vpmulld %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [0,1] DeE-R. . . . . . . . . . . . . . vpand %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [0,2] .DeeeER . . . . . . . . . . . . . vcvttps2dq %xmm0, %xmm1 -# CHECK-NEXT: [0,3] .D===eeER . . . . . . . . . . . . . vpclmulqdq $0, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [0,4] . D===eeeER . . . . . . . . . . . . vaddps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeER . . . . . . . . . vsqrtps %xmm0, %xmm1 -# CHECK-NEXT: [0,6] . D====================eeeER . . . . . . . . . vaddps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: [0,7] . DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER . . . . . vsqrtps %ymm0, %ymm1 +# CHECK: [0,0] DeeER. . . . . . . . . . . . . . vpmulld %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,1] DeE-R. . . . . . . . . . . . . . vpand %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,2] .DeeeER . . . . . . . . . . . . . vcvttps2dq %xmm0, %xmm1 +# CHECK-NEXT: [0,3] .D===eeER . . . . . . . . . . . . . vpclmulqdq $0, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,4] . D===eeeER . . . . . . . . . . . . vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeER . . . . . . . . . vsqrtps %xmm0, %xmm1 +# CHECK-NEXT: [0,6] . D====================eeeER . . . . . . . . . vaddps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: [0,7] . D====================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER . vsqrtps %ymm0, %ymm1 -# CHECK: [1,0] . D=========================================eeER. . . . . vpmulld %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [1,1] . D=========================================eE-R. . . . . vpand %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [1,2] . D=========================================eeeER . . . . vcvttps2dq %xmm0, %xmm1 -# CHECK-NEXT: [1,3] . D============================================eeER . . . . vpclmulqdq $0, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [1,4] . .D============================================eeeER . . . vaddps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [1,5] . .D=========================================eeeeeeeeeeeeeeeeeeeeeER vsqrtps %xmm0, %xmm1 +# CHECK: [1,0] . D=============================================================eeER vpmulld %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [1,1] . D=============================================================eE-R vpand %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [1,2] . DeeeE-----------------------------------------------------------R vcvttps2dq %xmm0, %xmm1 +# CHECK-NEXT: [1,3] . D===eeE---------------------------------------------------------R vpclmulqdq $0, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [1,4] . .D===eeeE-------------------------------------------------------R vaddps %xmm0, %xmm1, %xmm2 # CHECK: Average Wait times (based on the timeline view): @@ -104,11 +103,11 @@ vsqrtps %ymm0, %ymm1 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 21.5 0.5 0.0 vpmulld %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 1. 2 21.5 0.5 1.0 vpand %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 2. 2 21.5 21.5 0.0 vcvttps2dq %xmm0, %xmm1 -# CHECK-NEXT: 3. 2 24.5 0.0 0.0 vpclmulqdq $0, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 4. 2 24.5 1.0 0.0 vaddps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 5. 2 21.5 21.5 0.0 vsqrtps %xmm0, %xmm1 +# CHECK-NEXT: 0. 2 31.5 0.5 0.0 vpmulld %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 1. 2 31.5 0.5 1.0 vpand %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 2. 2 1.0 1.0 29.5 vcvttps2dq %xmm0, %xmm1 +# CHECK-NEXT: 3. 2 4.0 0.0 28.5 vpclmulqdq $0, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 4. 2 4.0 1.0 27.5 vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 5. 1 1.0 1.0 0.0 vsqrtps %xmm0, %xmm1 # CHECK-NEXT: 6. 1 21.0 0.0 0.0 vaddps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 7. 1 1.0 1.0 0.0 vsqrtps %ymm0, %ymm1 +# CHECK-NEXT: 7. 1 21.0 21.0 0.0 vsqrtps %ymm0, %ymm1 -- 2.7.4