From 96855ec39eb39985d3929a97f6b280e3426b7fd3 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 22 Apr 2018 14:43:12 +0000 Subject: [PATCH] [X86] Remove unnecessary WriteFVarBlend/WriteVarBlend InstRW overrides. This also fixes some of the ReadAfterLd issues due to InstRW. llvm-svn: 330544 --- llvm/lib/Target/X86/X86SchedBroadwell.td | 20 +++---------- llvm/lib/Target/X86/X86SchedHaswell.td | 20 +++---------- llvm/lib/Target/X86/X86SchedSandyBridge.td | 34 ++++------------------ llvm/lib/Target/X86/X86SchedSkylakeClient.td | 28 ++---------------- llvm/lib/Target/X86/X86SchedSkylakeServer.td | 32 ++------------------ llvm/test/CodeGen/X86/avx2-schedule.ll | 2 +- .../llvm-mca/X86/variable-blend-read-after-ld-1.s | 29 +++++++++--------- .../llvm-mca/X86/variable-blend-read-after-ld-2.s | 29 +++++++++--------- 8 files changed, 48 insertions(+), 146 deletions(-) diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index 22ea724..45d055e 100755 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -168,7 +168,7 @@ defm : BWWriteResPair; // Floating point and/or/x defm : BWWriteResPair; // Floating point vector shuffles. defm : BWWriteResPair; // Floating point vector variable shuffles. defm : BWWriteResPair; // Floating point vector blends. -defm : BWWriteResPair; // Fp vector variable blends. +defm : BWWriteResPair; // Fp vector variable blends. // FMA Scheduling helper class. // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -186,7 +186,7 @@ defm : BWWriteResPair; // PMULLD defm : BWWriteResPair; // Vector shuffles. defm : BWWriteResPair; // Vector variable shuffles. defm : BWWriteResPair; // Vector blends. -defm : BWWriteResPair; // Vector variable blends. +defm : BWWriteResPair; // Vector variable blends. defm : BWWriteResPair; // Vector MPSAD. defm : BWWriteResPair; // Vector PSADBW. @@ -470,13 +470,7 @@ def BWWriteResGroup11 : SchedWriteRes<[BWPort5]> { let NumMicroOps = 2; let ResourceCycles = [2]; } -def: InstRW<[BWWriteResGroup11], (instregex "BLENDVPDrr0", - "BLENDVPSrr0", - "MMX_PINSRWrr", - "PBLENDVBrr0", - "VBLENDVPD(Y?)rr", - "VBLENDVPS(Y?)rr", - "VPBLENDVB(Y?)rr", +def: InstRW<[BWWriteResGroup11], (instregex "MMX_PINSRWrr", "(V?)PINSRBrr", "(V?)PINSRDrr", "(V?)PINSRQrr", @@ -1340,17 +1334,11 @@ def BWWriteResGroup79 : SchedWriteRes<[BWPort5,BWPort23]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[BWWriteResGroup79], (instregex "BLENDVPDrm0", - "BLENDVPSrm0", - "MMX_PACKSSDWirm", +def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKSSDWirm", "MMX_PACKSSWBirm", "MMX_PACKUSWBirm", - "PBLENDVBrm0", - "VBLENDVPDrm", - "VBLENDVPSrm", "VMASKMOVPDrm", "VMASKMOVPSrm", - "VPBLENDVBrm", "VPMASKMOVDrm", "VPMASKMOVQrm")>; diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index 18e33a8..4b4c14a 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -167,7 +167,7 @@ defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; // Vector integer operations. def : WriteRes; @@ -184,7 +184,7 @@ defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; @@ -1226,13 +1226,7 @@ def HWWriteResGroup27 : SchedWriteRes<[HWPort5]> { let NumMicroOps = 2; let ResourceCycles = [2]; } -def: InstRW<[HWWriteResGroup27], (instregex "BLENDVPDrr0", - "BLENDVPSrr0", - "MMX_PINSRWrr", - "PBLENDVBrr0", - "VBLENDVPD(Y?)rr", - "VBLENDVPS(Y?)rr", - "VPBLENDVB(Y?)rr", +def: InstRW<[HWWriteResGroup27], (instregex "MMX_PINSRWrr", "(V?)PINSRBrr", "(V?)PINSRDrr", "(V?)PINSRQrr", @@ -1337,14 +1331,8 @@ def HWWriteResGroup36 : SchedWriteRes<[HWPort5,HWPort23]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[HWWriteResGroup36], (instregex "BLENDVPDrm0", - "BLENDVPSrm0", - "PBLENDVBrm0", - "VBLENDVPDrm", - "VBLENDVPSrm", - "VMASKMOVPDrm", +def: InstRW<[HWWriteResGroup36], (instregex "VMASKMOVPDrm", "VMASKMOVPSrm", - "VPBLENDVBrm", "VPMASKMOVDrm", "VPMASKMOVQrm")>; diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td index 5fcfda7..f0fc9a7 100644 --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -154,7 +154,7 @@ defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; -defm : SBWriteResPair; +defm : SBWriteResPair; // Vector integer operations. def : WriteRes; @@ -169,7 +169,7 @@ defm : SBWriteResPair; // TODO this is p defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; -defm : SBWriteResPair; +defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; @@ -472,23 +472,11 @@ def SBWriteResGroup9 : SchedWriteRes<[SBPort05]> { let NumMicroOps = 2; let ResourceCycles = [2]; } -def: InstRW<[SBWriteResGroup9], (instregex "BLENDVPDrr0", - "BLENDVPSrr0", - "ROL(8|16|32|64)r1", +def: InstRW<[SBWriteResGroup9], (instregex "ROL(8|16|32|64)r1", "ROL(8|16|32|64)ri", "ROR(8|16|32|64)r1", "ROR(8|16|32|64)ri", - "SET(A|BE)r", - "VBLENDVPD(Y?)rr", - "VBLENDVPS(Y?)rr")>; - -def SBWriteResGroup10 : SchedWriteRes<[SBPort15]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [2]; -} -def: InstRW<[SBWriteResGroup10], (instregex "PBLENDVBrr0", - "VPBLENDVBrr")>; + "SET(A|BE)r")>; def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> { let Latency = 2; @@ -1229,21 +1217,9 @@ def SBWriteResGroup75 : SchedWriteRes<[SBPort23,SBPort05]> { let NumMicroOps = 3; let ResourceCycles = [1,2]; } -def: InstRW<[SBWriteResGroup75], (instregex "BLENDVPDrm0", - "BLENDVPSrm0", - "VBLENDVPDrm", - "VBLENDVPSrm", - "VMASKMOVPDrm", +def: InstRW<[SBWriteResGroup75], (instregex "VMASKMOVPDrm", "VMASKMOVPSrm")>; -def SBWriteResGroup76 : SchedWriteRes<[SBPort23,SBPort15]> { - let Latency = 8; - let NumMicroOps = 3; - let ResourceCycles = [1,2]; -} -def: InstRW<[SBWriteResGroup76], (instregex "PBLENDVBrm0", - "VPBLENDVBrm")>; - def SBWriteResGroup77 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { let Latency = 8; let NumMicroOps = 3; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index f661f34..9b1603b 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -165,7 +165,7 @@ defm : SKLWriteResPair; // Floating poi defm : SKLWriteResPair; // Floating point vector shuffles. defm : SKLWriteResPair; // Floating point vector shuffles. defm : SKLWriteResPair; // Floating point vector blends. -defm : SKLWriteResPair; // Fp vector variable blends. +defm : SKLWriteResPair; // Fp vector variable blends. // FMA Scheduling helper class. // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -183,7 +183,7 @@ defm : SKLWriteResPair; defm : SKLWriteResPair; // Vector shuffles. defm : SKLWriteResPair; // Vector shuffles. defm : SKLWriteResPair; // Vector blends. -defm : SKLWriteResPair; // Vector variable blends. +defm : SKLWriteResPair; // Vector variable blends. defm : SKLWriteResPair; // Vector MPSAD. defm : SKLWriteResPair; // Vector PSADBW. @@ -607,18 +607,6 @@ def: InstRW<[SKLWriteResGroup15], (instregex "CMOV(A|BE)(16|32|64)rr", "ROR(8|16|32|64)ri", "SET(A|BE)r")>; -def SKLWriteResGroup16 : SchedWriteRes<[SKLPort015]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [2]; -} -def: InstRW<[SKLWriteResGroup16], (instregex "BLENDVPDrr0", - "BLENDVPSrr0", - "PBLENDVBrr0", - "VBLENDVPD(Y?)rr", - "VBLENDVPS(Y?)rr", - "VPBLENDVB(Y?)rr")>; - def SKLWriteResGroup17 : SchedWriteRes<[SKLPort0156]> { let Latency = 2; let NumMicroOps = 2; @@ -1726,18 +1714,6 @@ def: InstRW<[SKLWriteResGroup110], (instregex "VANDNPDYrm", "VXORPDYrm", "VXORPSYrm")>; -def SKLWriteResGroup111 : SchedWriteRes<[SKLPort23,SKLPort015]> { - let Latency = 8; - let NumMicroOps = 3; - let ResourceCycles = [1,2]; -} -def: InstRW<[SKLWriteResGroup111], (instregex "BLENDVPDrm0", - "BLENDVPSrm0", - "PBLENDVBrm0", - "VBLENDVPDrm", - "VBLENDVPSrm", - "VPBLENDVB(Y?)rm")>; - def SKLWriteResGroup112 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { let Latency = 8; let NumMicroOps = 4; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 7268358..b3b30e4 100755 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -165,7 +165,7 @@ defm : SKXWriteResPair; // Floating poi defm : SKXWriteResPair; // Floating point vector shuffles. defm : SKXWriteResPair; // Floating point vector variable shuffles. defm : SKXWriteResPair; // Floating point vector blends. -defm : SKXWriteResPair; // Fp vector variable blends. +defm : SKXWriteResPair; // Fp vector variable blends. // FMA Scheduling helper class. // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } @@ -183,7 +183,7 @@ defm : SKXWriteResPair; // Vector in defm : SKXWriteResPair; // Vector shuffles. defm : SKXWriteResPair; // Vector variable shuffles. defm : SKXWriteResPair; // Vector blends. -defm : SKXWriteResPair; // Vector variable blends. +defm : SKXWriteResPair; // Vector variable blends. defm : SKXWriteResPair; // Vector MPSAD. defm : SKXWriteResPair; // Vector PSADBW. @@ -1081,21 +1081,6 @@ def: InstRW<[SKXWriteResGroup15], (instregex "CMOV(A|BE)(16|32|64)rr", "ROR(8|16|32|64)ri", "SET(A|BE)r")>; -def SKXWriteResGroup16 : SchedWriteRes<[SKXPort015]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [2]; -} -def: InstRW<[SKXWriteResGroup16], (instregex "BLENDVPDrr0", - "BLENDVPSrr0", - "PBLENDVBrr0", - "VBLENDVPDYrr", - "VBLENDVPDrr", - "VBLENDVPSYrr", - "VBLENDVPSrr", - "VPBLENDVBYrr", - "VPBLENDVBrr")>; - def SKXWriteResGroup17 : SchedWriteRes<[SKXPort0156]> { let Latency = 2; let NumMicroOps = 2; @@ -3568,19 +3553,6 @@ def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPDYrm", "VXORPSZ256rm(b?)", "VXORPSZrm(b?)")>; -def SKXWriteResGroup122 : SchedWriteRes<[SKXPort23,SKXPort015]> { - let Latency = 8; - let NumMicroOps = 3; - let ResourceCycles = [1,2]; -} -def: InstRW<[SKXWriteResGroup122], (instregex "BLENDVPDrm0", - "BLENDVPSrm0", - "PBLENDVBrm0", - "VBLENDVPDrm", - "VBLENDVPSrm", - "VPBLENDVBYrm", - "VPBLENDVBrm")>; - def SKXWriteResGroup123 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { let Latency = 8; let NumMicroOps = 4; diff --git a/llvm/test/CodeGen/X86/avx2-schedule.ll b/llvm/test/CodeGen/X86/avx2-schedule.ll index 6adfed7..065bc62 100644 --- a/llvm/test/CodeGen/X86/avx2-schedule.ll +++ b/llvm/test/CodeGen/X86/avx2-schedule.ll @@ -1670,7 +1670,7 @@ define <32 x i8> @test_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2, <32 ; GENERIC-LABEL: test_pblendvb: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; GENERIC-NEXT: vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; GENERIC-NEXT: vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pblendvb: diff --git a/llvm/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s b/llvm/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s index 9f774a8..4451597 100644 --- a/llvm/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s +++ b/llvm/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=SANDY # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=ivybridge -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=IVY @@ -16,23 +17,23 @@ vaddps %xmm0, %xmm0, %xmm1 vblendvps %xmm1, (%rdi), %xmm2, %xmm3 -# SANDY: [0,0] DeeeER . . vaddps %xmm0, %xmm0, %xmm1 -# SANDY-NEXT: [0,1] D===eeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 +# SANDY: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm1 +# SANDY-NEXT: [0,1] DeeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 -# IVY: [0,0] DeeeER . . vaddps %xmm0, %xmm0, %xmm1 -# IVY-NEXT: [0,1] D===eeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 +# IVY: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm1 +# IVY-NEXT: [0,1] DeeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 -# HASWELL: [0,0] DeeeER . . vaddps %xmm0, %xmm0, %xmm1 -# HASWELL-NEXT: [0,1] D===eeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 +# HASWELL: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm1 +# HASWELL-NEXT: [0,1] DeeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 -# BDWELL: [0,0] DeeeER . . vaddps %xmm0, %xmm0, %xmm1 -# BDWELL-NEXT: [0,1] D===eeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 +# BDWELL: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm1 +# BDWELL-NEXT: [0,1] DeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 -# SKYLAKE: [0,0] DeeeeER . . vaddps %xmm0, %xmm0, %xmm1 -# SKYLAKE-NEXT: [0,1] D====eeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 +# SKYLAKE: [0,0] DeeeeER . vaddps %xmm0, %xmm0, %xmm1 +# SKYLAKE-NEXT: [0,1] DeeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 -# BTVER2: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm1 -# BTVER2-NEXT: [0,1] .DeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 +# BTVER2: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm1 +# BTVER2-NEXT: [0,1] .DeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 -# ZNVER1: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm1 -# ZNVER1-NEXT: [0,1] DeeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 +# ZNVER1: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm1 +# ZNVER1-NEXT: [0,1] DeeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 diff --git a/llvm/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s b/llvm/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s index 8beea9a..77ef121 100644 --- a/llvm/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s +++ b/llvm/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=SANDY # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=ivybridge -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=IVY @@ -16,23 +17,23 @@ vaddps %xmm0, %xmm0, %xmm2 vblendvps %xmm1, (%rdi), %xmm2, %xmm3 -# SANDY: [0,0] DeeeER . . vaddps %xmm0, %xmm0, %xmm2 -# SANDY-NEXT: [0,1] D===eeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 +# SANDY: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm2 +# SANDY-NEXT: [0,1] DeeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 -# IVY: [0,0] DeeeER . . vaddps %xmm0, %xmm0, %xmm2 -# IVY-NEXT: [0,1] D===eeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 +# IVY: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm2 +# IVY-NEXT: [0,1] DeeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 -# HASWELL: [0,0] DeeeER . . vaddps %xmm0, %xmm0, %xmm2 -# HASWELL-NEXT: [0,1] D===eeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 +# HASWELL: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm2 +# HASWELL-NEXT: [0,1] DeeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 -# BDWELL: [0,0] DeeeER . . vaddps %xmm0, %xmm0, %xmm2 -# BDWELL-NEXT: [0,1] D===eeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 +# BDWELL: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm2 +# BDWELL-NEXT: [0,1] DeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 -# SKYLAKE: [0,0] DeeeeER . . vaddps %xmm0, %xmm0, %xmm2 -# SKYLAKE-NEXT: [0,1] D====eeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 +# SKYLAKE: [0,0] DeeeeER . vaddps %xmm0, %xmm0, %xmm2 +# SKYLAKE-NEXT: [0,1] DeeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 -# BTVER2: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm2 -# BTVER2-NEXT: [0,1] .DeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 +# BTVER2: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm2 +# BTVER2-NEXT: [0,1] .DeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 -# ZNVER1: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm2 -# ZNVER1-NEXT: [0,1] DeeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 +# ZNVER1: [0,0] DeeeER . vaddps %xmm0, %xmm0, %xmm2 +# ZNVER1-NEXT: [0,1] DeeeeeeeeER vblendvps %xmm1, (%rdi), %xmm2, %xmm3 -- 2.7.4