[X86][Sched] A bunch of fixes to the Zen2 sched model latencies.
author Clement Courbet <courbet@google.com>
Wed, 22 Jan 2020 10:44:12 +0000 (11:44 +0100)
committer Clement Courbet <courbet@google.com>
Thu, 30 Jan 2020 09:20:31 +0000 (10:20 +0100)
Summary:
As determined with `llvm-exegesis`.

Some of these look like typos/misunderstandings of the sched model td
spec:
  - latency defaults to `1` when not set => Maybe we can avoid
    having a default?
  - problems with regexps not being anchored by default (XCHG matching
    CMPXCHG)

Note that this is not complete, it fixes only the most obvious mistakes,
and only for latency (not uops).

Reviewers: RKSimon, GGanesh

Subscribers: hiraditya, jfb, mstojanovic, hfinkel, craig.topper, andreadb, lebedev.ri, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D73172

llvm/lib/Target/X86/X86ScheduleZnver2.td
llvm/test/tools/llvm-mca/X86/Znver2/resources-avx1.s
llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s
llvm/test/tools/llvm-mca/X86/Znver2/resources-sse1.s
llvm/test/tools/llvm-mca/X86/Znver2/resources-sse2.s
llvm/test/tools/llvm-mca/X86/Znver2/resources-sse3.s
llvm/test/tools/llvm-mca/X86/Znver2/resources-sse41.s
llvm/test/tools/llvm-mca/X86/Znver2/resources-sse4a.s
llvm/test/tools/llvm-mca/X86/Znver2/resources-ssse3.s
llvm/test/tools/llvm-mca/X86/Znver2/resources-x86_64.s

index e844317..1517ae7 100644 (file)
@@ -187,7 +187,7 @@ defm : Zn2WriteResPair<WriteIMul8,     [Zn2ALU1, Zn2Multiplier], 4>;
 
 defm : X86WriteRes<WriteBSWAP32, [Zn2ALU], 1, [4], 1>;
 defm : X86WriteRes<WriteBSWAP64, [Zn2ALU], 1, [4], 1>;
-defm : X86WriteRes<WriteCMPXCHG, [Zn2ALU], 1, [1], 1>;
+defm : X86WriteRes<WriteCMPXCHG, [Zn2ALU], 3, [1], 1>;
 defm : X86WriteRes<WriteCMPXCHGRMW,[Zn2ALU,Zn2AGU], 8, [1,1], 5>;
 defm : X86WriteRes<WriteXCHG, [Zn2ALU], 1, [2], 2>;
 
@@ -216,7 +216,7 @@ defm : X86WriteRes<WriteBitTestSet,      [Zn2ALU], 2, [1], 2>;
 
 // Bit counts.
 defm : Zn2WriteResPair<WriteBSF, [Zn2ALU], 3>;
-defm : Zn2WriteResPair<WriteBSR, [Zn2ALU], 3>;
+defm : Zn2WriteResPair<WriteBSR, [Zn2ALU], 4>;
 defm : Zn2WriteResPair<WriteLZCNT,          [Zn2ALU], 1>;
 defm : Zn2WriteResPair<WriteTZCNT,          [Zn2ALU], 2>;
 defm : Zn2WriteResPair<WritePOPCNT,         [Zn2ALU], 1>;
@@ -272,13 +272,13 @@ defm : Zn2WriteResFpuPair<WriteFAdd64,    [Zn2FPU0],  3>;
 defm : Zn2WriteResFpuPair<WriteFAdd64X,   [Zn2FPU0],  3>;
 defm : Zn2WriteResFpuPair<WriteFAdd64Y,   [Zn2FPU0],  3>;
 defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
-defm : Zn2WriteResFpuPair<WriteFCmp,      [Zn2FPU0],  3>;
-defm : Zn2WriteResFpuPair<WriteFCmpX,     [Zn2FPU0],  3>;
-defm : Zn2WriteResFpuPair<WriteFCmpY,     [Zn2FPU0],  3>;
+defm : Zn2WriteResFpuPair<WriteFCmp,      [Zn2FPU0],  1>;
+defm : Zn2WriteResFpuPair<WriteFCmpX,     [Zn2FPU0],  1>;
+defm : Zn2WriteResFpuPair<WriteFCmpY,     [Zn2FPU0],  1>;
 defm : X86WriteResPairUnsupported<WriteFCmpZ>;
-defm : Zn2WriteResFpuPair<WriteFCmp64,    [Zn2FPU0],  3>;
-defm : Zn2WriteResFpuPair<WriteFCmp64X,   [Zn2FPU0],  3>;
-defm : Zn2WriteResFpuPair<WriteFCmp64Y,   [Zn2FPU0],  3>;
+defm : Zn2WriteResFpuPair<WriteFCmp64,    [Zn2FPU0],  1>;
+defm : Zn2WriteResFpuPair<WriteFCmp64X,   [Zn2FPU0],  1>;
+defm : Zn2WriteResFpuPair<WriteFCmp64Y,   [Zn2FPU0],  1>;
 defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
 defm : Zn2WriteResFpuPair<WriteFCom,      [Zn2FPU0],  3>;
 defm : Zn2WriteResFpuPair<WriteFComX,     [Zn2FPU0],  3>;
@@ -314,8 +314,8 @@ defm : Zn2WriteResFpuPair<WriteFDiv64,    [Zn2FPU3], 15>;
 defm : Zn2WriteResFpuPair<WriteFDiv64X,   [Zn2FPU3], 15>;
 defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
 defm : Zn2WriteResFpuPair<WriteFSign,     [Zn2FPU3],  2>;
-defm : Zn2WriteResFpuPair<WriteFRnd,      [Zn2FPU3],  4, [1], 1, 7, 0>;
-defm : Zn2WriteResFpuPair<WriteFRndY,     [Zn2FPU3],  4, [1], 1, 7, 0>;
+defm : Zn2WriteResFpuPair<WriteFRnd,      [Zn2FPU3],  3, [1], 1, 7, 0>;
+defm : Zn2WriteResFpuPair<WriteFRndY,     [Zn2FPU3],  3, [1], 1, 7, 0>;
 defm : X86WriteResPairUnsupported<WriteFRndZ>;
 defm : Zn2WriteResFpuPair<WriteFLogic,    [Zn2FPU],   1>;
 defm : Zn2WriteResFpuPair<WriteFLogicY,   [Zn2FPU],   1>;
@@ -326,16 +326,16 @@ defm : X86WriteResPairUnsupported<WriteFTestZ>;
 defm : Zn2WriteResFpuPair<WriteFShuffle,  [Zn2FPU12], 1>;
 defm : Zn2WriteResFpuPair<WriteFShuffleY, [Zn2FPU12], 1>;
 defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
-defm : Zn2WriteResFpuPair<WriteFVarShuffle, [Zn2FPU12], 1>;
-defm : Zn2WriteResFpuPair<WriteFVarShuffleY,[Zn2FPU12], 1>;
+defm : Zn2WriteResFpuPair<WriteFVarShuffle, [Zn2FPU12], 3>;
+defm : Zn2WriteResFpuPair<WriteFVarShuffleY,[Zn2FPU12], 3>;
 defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
 defm : Zn2WriteResFpuPair<WriteFMul,      [Zn2FPU01], 3, [1], 1, 7, 1>;
 defm : Zn2WriteResFpuPair<WriteFMulX,     [Zn2FPU01], 3, [1], 1, 7, 1>;
-defm : Zn2WriteResFpuPair<WriteFMulY,     [Zn2FPU01], 4, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteFMulY,     [Zn2FPU01], 3, [1], 1, 7, 1>;
 defm : X86WriteResPairUnsupported<WriteFMulZ>;
 defm : Zn2WriteResFpuPair<WriteFMul64,    [Zn2FPU01], 3, [1], 1, 7, 1>;
 defm : Zn2WriteResFpuPair<WriteFMul64X,   [Zn2FPU01], 3, [1], 1, 7, 1>;
-defm : Zn2WriteResFpuPair<WriteFMul64Y,   [Zn2FPU01], 4, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteFMul64Y,   [Zn2FPU01], 3, [1], 1, 7, 1>;
 defm : X86WriteResPairUnsupported<WriteFMul64Z>;
 defm : Zn2WriteResFpuPair<WriteFMA,       [Zn2FPU03], 5>;
 defm : Zn2WriteResFpuPair<WriteFMAX,      [Zn2FPU03], 5>;
@@ -381,7 +381,7 @@ defm : X86WriteRes<WriteEMMS,            [Zn2FPU], 2, [1], 1>;
 
 defm : Zn2WriteResFpuPair<WriteVecShift,   [Zn2FPU],   1>;
 defm : Zn2WriteResFpuPair<WriteVecShiftX,  [Zn2FPU2],  1>;
-defm : Zn2WriteResFpuPair<WriteVecShiftY,  [Zn2FPU2],  2>;
+defm : Zn2WriteResFpuPair<WriteVecShiftY,  [Zn2FPU2],  1>;
 defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
 defm : Zn2WriteResFpuPair<WriteVecShiftImm,  [Zn2FPU], 1>;
 defm : Zn2WriteResFpuPair<WriteVecShiftImmX, [Zn2FPU], 1>;
@@ -403,7 +403,7 @@ defm : Zn2WriteResFpuPair<WriteVecIMulX,   [Zn2FPU0],  4>;
 defm : Zn2WriteResFpuPair<WriteVecIMulY,   [Zn2FPU0],  4>;
 defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
 defm : Zn2WriteResFpuPair<WritePMULLD,     [Zn2FPU0],  4, [1], 1, 7, 1>;
-defm : Zn2WriteResFpuPair<WritePMULLDY,    [Zn2FPU0],  3, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WritePMULLDY,    [Zn2FPU0],  4, [1], 1, 7, 1>;
 defm : X86WriteResPairUnsupported<WritePMULLDZ>;
 defm : Zn2WriteResFpuPair<WriteShuffle,    [Zn2FPU],   1>;
 defm : Zn2WriteResFpuPair<WriteShuffleX,   [Zn2FPU],   1>;
@@ -425,8 +425,8 @@ defm : X86WriteResPairUnsupported<WritePSADBWZ>;
 defm : Zn2WriteResFpuPair<WritePHMINPOS,   [Zn2FPU0],  4>;
 
 // Vector Shift Operations
-defm : Zn2WriteResFpuPair<WriteVarVecShift,  [Zn2FPU12], 1>;
-defm : Zn2WriteResFpuPair<WriteVarVecShiftY, [Zn2FPU12], 1>;
+defm : Zn2WriteResFpuPair<WriteVarVecShift,  [Zn2FPU12], 3>;
+defm : Zn2WriteResFpuPair<WriteVarVecShiftY, [Zn2FPU12], 3>;
 defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
 
 // Vector insert/extract operations.
@@ -470,6 +470,12 @@ defm : Zn2WriteResFpuPair<WriteFVarShuffle256, [Zn2FPU], 100>;
 def Zn2WriteMicrocoded : SchedWriteRes<[]> {
   let Latency = 100;
 }
+defm : Zn2WriteResPair<WriteDPPS, [], 15>;
+defm : Zn2WriteResPair<WriteFHAdd, [], 7>;
+defm : Zn2WriteResPair<WriteFHAddY, [], 7>;
+defm : Zn2WriteResPair<WritePHAdd, [], 3>;
+defm : Zn2WriteResPair<WritePHAddX, [], 3>;
+defm : Zn2WriteResPair<WritePHAddY, [], 3>;
 
 def : SchedAlias<WriteMicrocoded, Zn2WriteMicrocoded>;
 def : SchedAlias<WriteFCMOV, Zn2WriteMicrocoded>;
@@ -518,14 +524,14 @@ def Zn2WriteXCHG : SchedWriteRes<[Zn2ALU]> {
   let NumMicroOps = 2;
 }
 
-def : InstRW<[Zn2WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>;
+def : InstRW<[Zn2WriteXCHG], (instregex "^XCHG(8|16|32|64)rr", "^XCHG(16|32|64)ar")>;
 
 // r,m.
 def Zn2WriteXCHGrm : SchedWriteRes<[Zn2AGU, Zn2ALU]> {
   let Latency = 5;
   let NumMicroOps = 2;
 }
-def : InstRW<[Zn2WriteXCHGrm, ReadAfterLd], (instregex "XCHG(8|16|32|64)rm")>;
+def : InstRW<[Zn2WriteXCHGrm, ReadAfterLd], (instregex "^XCHG(8|16|32|64)rm")>;
 
 def : InstRW<[WriteMicrocoded], (instrs XLAT)>;
 
@@ -595,8 +601,11 @@ def : InstRW<[WriteALULd],
 def Zn2WriteMul16 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
   let Latency = 3;
 }
+def Zn2WriteMul16Imm : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
+  let Latency = 4;
+}
 def : SchedAlias<WriteIMul16, Zn2WriteMul16>;
-def : SchedAlias<WriteIMul16Imm, Zn2WriteMul16>;
+def : SchedAlias<WriteIMul16Imm, Zn2WriteMul16Imm>;
 def : SchedAlias<WriteIMul16Reg, Zn2WriteMul16>;
 
 // m16.
@@ -1002,6 +1011,7 @@ def : InstRW<[WriteMicrocoded], (instrs FNINIT)>;
 // mm <- mm.
 def Zn2WriteFPU12 : SchedWriteRes<[Zn2FPU12]> ;
 def Zn2WriteFPU12Y : SchedWriteRes<[Zn2FPU12]> {
+  let Latency = 4;
   let NumMicroOps = 2;
 }
 def Zn2WriteFPU12m : SchedWriteRes<[Zn2AGU, Zn2FPU12]> ;
@@ -1110,15 +1120,6 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>;
 
 //-- Arithmetic instructions --//
 
-// HADD, HSUB PS/PD
-// PHADD|PHSUB (S) W/D.
-def : SchedAlias<WritePHAdd,    Zn2WriteMicrocoded>;
-def : SchedAlias<WritePHAddLd,  Zn2WriteMicrocoded>;
-def : SchedAlias<WritePHAddX,   Zn2WriteMicrocoded>;
-def : SchedAlias<WritePHAddXLd, Zn2WriteMicrocoded>;
-def : SchedAlias<WritePHAddY,   Zn2WriteMicrocoded>;
-def : SchedAlias<WritePHAddYLd, Zn2WriteMicrocoded>;
-
 // PCMPGTQ.
 def Zn2WritePCMPGTQr : SchedWriteRes<[Zn2FPU03]>;
 def : InstRW<[Zn2WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
@@ -1138,8 +1139,12 @@ def : InstRW<[Zn2WritePCMPGTQYm], (instrs VPCMPGTQYrm)>;
 
 // PSLL,PSRL,PSRA W/D/Q.
 // x,x / v,v,x.
-def Zn2WritePShift  : SchedWriteRes<[Zn2FPU2]> ;
-def Zn2WritePShiftY : SchedWriteRes<[Zn2FPU2]> ;
+def Zn2WritePShift  : SchedWriteRes<[Zn2FPU2]> {
+  let Latency = 3;
+}
+def Zn2WritePShiftY : SchedWriteRes<[Zn2FPU2]> {
+  let Latency = 3;
+}
 
 // PSLL,PSRL DQ.
 def : InstRW<[Zn2WritePShift], (instregex "(V?)PS(R|L)LDQri")>;
@@ -1281,7 +1286,7 @@ def Zn2WriteCVTDQ2PDr: SchedWriteRes<[Zn2FPU12,Zn2FPU3]> {
 }
 // CVTDQ2PD.
 // x,x.
-def : InstRW<[Zn2WriteCVTDQ2PDr], (instregex "(V)?CVTDQ2PDrr")>;
+def : InstRW<[Zn2WriteCVTDQ2PDr], (instregex "(V)?CVTDQ2P(D|S)rr")>;
 
 // Same as xmm
 // y,x.
@@ -1291,9 +1296,9 @@ def : InstRW<[Zn2WriteCVTDQ2PDr], (instrs VCVTDQ2PSYrr)>;
 def Zn2WriteCVTPD2DQr: SchedWriteRes<[Zn2FPU12, Zn2FPU3]> {
   let Latency = 3;
 }
-// CVT(T)PD2DQ.
+// CVT(T)P(D|S)2DQ.
 // x,x.
-def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)PD2DQrr")>;
+def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)P(D|S)2DQrr")>;
 
 def Zn2WriteCVTPD2DQLd: SchedWriteRes<[Zn2AGU,Zn2FPU12,Zn2FPU3]> {
   let Latency = 10;
@@ -1323,7 +1328,7 @@ def : InstRW<[Zn2WriteCVTPS2PDr], (instrs MMX_CVTPI2PDirr)>;
 def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>;
 
 def Zn2WriteCVSTSI2SSr: SchedWriteRes<[Zn2FPU3]> {
-  let Latency = 4;
+  let Latency = 3;
 }
 
 // same as CVTPD2DQr
@@ -1335,7 +1340,7 @@ def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)SS2SI(64)?rr")>;
 def : InstRW<[Zn2WriteCVTPD2DQLd], (instregex "(V?)CVT(T?)SS2SI(64)?rm")>;
 
 def Zn2WriteCVSTSI2SDr: SchedWriteRes<[Zn2FPU013, Zn2FPU3]> {
-  let Latency = 4;
+  let Latency = 3;
 }
 // CVTSI2SD.
 // x,r32/64.
@@ -1377,7 +1382,7 @@ defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
 //-- SSE4A instructions --//
 // EXTRQ
 def Zn2WriteEXTRQ: SchedWriteRes<[Zn2FPU12, Zn2FPU2]> {
-  let Latency = 2;
+  let Latency = 3;
 }
 def : InstRW<[Zn2WriteEXTRQ], (instregex "EXTRQ")>;
 
@@ -1449,12 +1454,6 @@ def : InstRW<[Zn2WriteSHA256RNDS2Ld], (instregex "SHA256RNDS2rm")>;
 
 //-- Arithmetic instructions --//
 
-// HADD, HSUB PS/PD
-def : SchedAlias<WriteFHAdd,    Zn2WriteMicrocoded>;
-def : SchedAlias<WriteFHAddLd,  Zn2WriteMicrocoded>;
-def : SchedAlias<WriteFHAddY,   Zn2WriteMicrocoded>;
-def : SchedAlias<WriteFHAddYLd, Zn2WriteMicrocoded>;
-
 // VDIVPS.
 // TODO - convert to Zn2WriteResFpuPair
 // y,y,y.
@@ -1491,11 +1490,9 @@ def : SchedAlias<WriteFDiv64YLd, Zn2WriteVDIVPDYLd>;
 
 // DPPS.
 // x,x,i / v,v,v,i.
-def : SchedAlias<WriteDPPS,   Zn2WriteMicrocoded>;
 def : SchedAlias<WriteDPPSY,  Zn2WriteMicrocoded>;
 
 // x,m,i / v,v,m,i.
-def : SchedAlias<WriteDPPSLd, Zn2WriteMicrocoded>;
 def : SchedAlias<WriteDPPSYLd,Zn2WriteMicrocoded>;
 
 // DPPD.
index 3b1290f..4bbe2cc 100644 (file)
@@ -1098,18 +1098,18 @@ vzeroupper
 # CHECK-NEXT:  1      8     0.50    *                   vbroadcastsd   (%rax), %ymm2
 # CHECK-NEXT:  1      8     0.50    *                   vbroadcastss   (%rax), %xmm2
 # CHECK-NEXT:  1      8     0.50    *                   vbroadcastss   (%rax), %ymm2
-# CHECK-NEXT:  1      3     1.00                        vcmpeqpd       %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   vcmpeqpd       (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vcmpeqpd       %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      10    1.00    *                   vcmpeqpd       (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vcmpeqps       %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   vcmpeqps       (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vcmpeqps       %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      10    1.00    *                   vcmpeqps       (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vcmpeqsd       %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   vcmpeqsd       (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vcmpeqss       %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   vcmpeqss       (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vcmpeqpd       %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   vcmpeqpd       (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vcmpeqpd       %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      8     1.00    *                   vcmpeqpd       (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vcmpeqps       %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   vcmpeqps       (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vcmpeqps       %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      8     1.00    *                   vcmpeqps       (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vcmpeqsd       %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   vcmpeqsd       (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vcmpeqss       %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   vcmpeqss       (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      3     1.00                        vcomisd        %xmm0, %xmm1
 # CHECK-NEXT:  1      10    1.00    *                   vcomisd        (%rax), %xmm1
 # CHECK-NEXT:  1      3     1.00                        vcomiss        %xmm0, %xmm1
@@ -1118,7 +1118,7 @@ vzeroupper
 # CHECK-NEXT:  1      12    1.00    *                   vcvtdq2pd      (%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        vcvtdq2pd      %xmm0, %ymm2
 # CHECK-NEXT:  1      12    1.00    *                   vcvtdq2pd      (%rax), %ymm2
-# CHECK-NEXT:  1      5     1.00                        vcvtdq2ps      %xmm0, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vcvtdq2ps      %xmm0, %xmm2
 # CHECK-NEXT:  1      12    1.00    *                   vcvtdq2ps      (%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        vcvtdq2ps      %ymm0, %ymm2
 # CHECK-NEXT:  1      12    1.00    *                   vcvtdq2ps      (%rax), %ymm2
@@ -1130,7 +1130,7 @@ vzeroupper
 # CHECK-NEXT:  2      10    0.50    *                   vcvtpd2psx     (%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        vcvtpd2ps      %ymm0, %xmm2
 # CHECK-NEXT:  1      10    1.00    *                   vcvtpd2psy     (%rax), %xmm2
-# CHECK-NEXT:  1      5     1.00                        vcvtps2dq      %xmm0, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vcvtps2dq      %xmm0, %xmm2
 # CHECK-NEXT:  1      12    1.00    *                   vcvtps2dq      (%rax), %xmm2
 # CHECK-NEXT:  1      5     1.00                        vcvtps2dq      %ymm0, %ymm2
 # CHECK-NEXT:  1      12    1.00    *                   vcvtps2dq      (%rax), %ymm2
@@ -1144,8 +1144,8 @@ vzeroupper
 # CHECK-NEXT:  1      11    1.00    *                   vcvtsd2si      (%rax), %rcx
 # CHECK-NEXT:  1      3     1.00                        vcvtsd2ss      %xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  2      10    0.50    *                   vcvtsd2ss      (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      4     1.00                        vcvtsi2sd      %ecx, %xmm0, %xmm2
-# CHECK-NEXT:  1      4     1.00                        vcvtsi2sd      %rcx, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vcvtsi2sd      %ecx, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vcvtsi2sd      %rcx, %xmm0, %xmm2
 # CHECK-NEXT:  1      12    1.00    *                   vcvtsi2sdl     (%rax), %xmm0, %xmm2
 # CHECK-NEXT:  1      12    1.00    *                   vcvtsi2sdq     (%rax), %xmm0, %xmm2
 # CHECK-NEXT:  1      5     1.00                        vcvtsi2ss      %ecx, %xmm0, %xmm2
@@ -1162,7 +1162,7 @@ vzeroupper
 # CHECK-NEXT:  2      10    1.00    *                   vcvttpd2dqx    (%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        vcvttpd2dq     %ymm0, %xmm2
 # CHECK-NEXT:  2      10    1.00    *                   vcvttpd2dqy    (%rax), %xmm2
-# CHECK-NEXT:  1      5     1.00                        vcvttps2dq     %xmm0, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vcvttps2dq     %xmm0, %xmm2
 # CHECK-NEXT:  1      12    1.00    *                   vcvttps2dq     (%rax), %xmm2
 # CHECK-NEXT:  1      5     1.00                        vcvttps2dq     %ymm0, %ymm2
 # CHECK-NEXT:  1      12    1.00    *                   vcvttps2dq     (%rax), %ymm2
@@ -1188,30 +1188,30 @@ vzeroupper
 # CHECK-NEXT:  1      22    1.00    *                   vdivss (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      100   0.25                        vdppd  $22, %xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  1      100   0.25    *                   vdppd  $22, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25                        vdpps  $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   vdpps  $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      15    0.25                        vdpps  $22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      19    0.33    *                   vdpps  $22, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      100   0.25                        vdpps  $22, %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  1      100   0.25    *                   vdpps  $22, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.33                        vextractf128   $1, %ymm0, %xmm2
 # CHECK-NEXT:  2      8     0.33           *            vextractf128   $1, %ymm0, (%rax)
 # CHECK-NEXT:  1      2     2.00                        vextractps     $1, %xmm0, %ecx
 # CHECK-NEXT:  2      5     2.00           *            vextractps     $1, %xmm0, (%rax)
-# CHECK-NEXT:  1      100   0.25                        vhaddpd        %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   vhaddpd        (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25                        vhaddpd        %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25    *                   vhaddpd        (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25                        vhaddps        %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   vhaddps        (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25                        vhaddps        %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25    *                   vhaddps        (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25                        vhsubpd        %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   vhsubpd        (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25                        vhsubpd        %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25    *                   vhsubpd        (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25                        vhsubps        %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   vhsubps        (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25                        vhsubps        %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25    *                   vhsubps        (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      7     0.25                        vhaddpd        %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    0.33    *                   vhaddpd        (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.25                        vhaddpd        %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      11    0.33    *                   vhaddpd        (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      7     0.25                        vhaddps        %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    0.33    *                   vhaddps        (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.25                        vhaddps        %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      11    0.33    *                   vhaddps        (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      7     0.25                        vhsubpd        %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    0.33    *                   vhsubpd        (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.25                        vhsubpd        %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      11    0.33    *                   vhsubpd        (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      7     0.25                        vhsubps        %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      11    0.33    *                   vhsubps        (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.25                        vhsubps        %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      11    0.33    *                   vhsubps        (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      2     0.33                        vinsertf128    $1, %xmm0, %ymm1, %ymm2
 # CHECK-NEXT:  2      9     0.33    *                   vinsertf128    $1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.50                        vinsertps      $1, %xmm0, %xmm1, %xmm2
@@ -1228,30 +1228,30 @@ vzeroupper
 # CHECK-NEXT:  2      8     0.50    *                   vmaskmovps     (%rax), %ymm0, %ymm2
 # CHECK-NEXT:  1      4     0.50    *      *            vmaskmovps     %xmm0, %xmm1, (%rax)
 # CHECK-NEXT:  2      5     1.00    *      *            vmaskmovps     %ymm0, %ymm1, (%rax)
-# CHECK-NEXT:  1      3     1.00                        vmaxpd %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   vmaxpd (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vmaxpd %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      10    1.00    *                   vmaxpd (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vmaxps %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   vmaxps (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vmaxps %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      10    1.00    *                   vmaxps (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vmaxsd %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   vmaxsd (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vmaxss %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   vmaxss (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vminpd %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   vminpd (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vminpd %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      10    1.00    *                   vminpd (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vminps %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   vminps (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vminps %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      10    1.00    *                   vminps (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vminsd %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   vminsd (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vminss %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   vminss (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmaxpd %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   vmaxpd (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmaxpd %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      8     1.00    *                   vmaxpd (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vmaxps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   vmaxps (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmaxps %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      8     1.00    *                   vmaxps (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vmaxsd %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   vmaxsd (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vmaxss %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   vmaxss (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vminpd %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   vminpd (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vminpd %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      8     1.00    *                   vminpd (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vminps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   vminps (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vminps %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      8     1.00    *                   vminps (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vminsd %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   vminsd (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     1.00                        vminss %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   vminss (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      1     0.25                        vmovapd        %xmm0, %xmm2
 # CHECK-NEXT:  1      1     0.33           *            vmovapd        %xmm0, (%rax)
 # CHECK-NEXT:  1      8     0.33    *                   vmovapd        (%rax), %xmm2
@@ -1341,12 +1341,12 @@ vzeroupper
 # CHECK-NEXT:  1      100   0.25    *                   vmpsadbw       $1, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      3     0.50                        vmulpd %xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  2      10    0.50    *                   vmulpd (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      4     0.50                        vmulpd %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      11    0.50    *                   vmulpd (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     0.50                        vmulpd %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vmulpd (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      3     0.50                        vmulps %xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  2      10    0.50    *                   vmulps (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      4     0.50                        vmulps %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      11    0.50    *                   vmulps (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     0.50                        vmulps %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    0.50    *                   vmulps (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      3     0.50                        vmulsd %xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  2      10    0.50    *                   vmulsd (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      3     0.50                        vmulss %xmm0, %xmm1, %xmm2
@@ -1433,20 +1433,20 @@ vzeroupper
 # CHECK-NEXT:  1      100   0.25    *                   vperm2f128     $1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.50                        vpermilpd      $1, %xmm0, %xmm2
 # CHECK-NEXT:  1      8     0.50    *                   vpermilpd      $1, (%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpermilpd      %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      8     0.50    *                   vpermilpd      (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpermilpd      %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vpermilpd      (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      1     0.50                        vpermilpd      $1, %ymm0, %ymm2
 # CHECK-NEXT:  1      8     0.50    *                   vpermilpd      $1, (%rax), %ymm2
-# CHECK-NEXT:  1      1     0.50                        vpermilpd      %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      8     0.50    *                   vpermilpd      (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     0.50                        vpermilpd      %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vpermilpd      (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.50                        vpermilps      $1, %xmm0, %xmm2
 # CHECK-NEXT:  1      8     0.50    *                   vpermilps      $1, (%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpermilps      %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      8     0.50    *                   vpermilps      (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpermilps      %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vpermilps      (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      1     0.50                        vpermilps      $1, %ymm0, %ymm2
 # CHECK-NEXT:  1      8     0.50    *                   vpermilps      $1, (%rax), %ymm2
-# CHECK-NEXT:  1      1     0.50                        vpermilps      %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      8     0.50    *                   vpermilps      (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     0.50                        vpermilps      %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vpermilps      (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      2     2.00                        vpextrb        $1, %xmm0, %ecx
 # CHECK-NEXT:  2      5     3.00           *            vpextrb        $1, %xmm0, (%rax)
 # CHECK-NEXT:  1      2     2.00                        vpextrd        $1, %xmm0, %ecx
@@ -1455,20 +1455,20 @@ vzeroupper
 # CHECK-NEXT:  2      5     3.00           *            vpextrq        $1, %xmm0, (%rax)
 # CHECK-NEXT:  1      2     2.00                        vpextrw        $1, %xmm0, %ecx
 # CHECK-NEXT:  2      5     3.00           *            vpextrw        $1, %xmm0, (%rax)
-# CHECK-NEXT:  1      100   0.25                        vphaddd        %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   vphaddd        (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25                        vphaddsw       %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   vphaddsw       (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25                        vphaddw        %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   vphaddw        (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     0.25                        vphaddd        %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.33    *                   vphaddd        (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     0.25                        vphaddsw       %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.33    *                   vphaddsw       (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     0.25                        vphaddw        %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.33    *                   vphaddw        (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      4     1.00                        vphminposuw    %xmm0, %xmm2
 # CHECK-NEXT:  1      11    1.00    *                   vphminposuw    (%rax), %xmm2
-# CHECK-NEXT:  1      100   0.25                        vphsubd        %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   vphsubd        (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25                        vphsubsw       %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   vphsubsw       (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25                        vphsubw        %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   vphsubw        (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     0.25                        vphsubd        %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.33    *                   vphsubd        (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     0.25                        vphsubsw       %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.33    *                   vphsubsw       (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     0.25                        vphsubw        %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      7     0.33    *                   vphsubw        (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      1     0.25                        vpinsrb        $1, %eax, %xmm1, %xmm2
 # CHECK-NEXT:  1      8     0.33    *                   vpinsrb        $1, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      1     0.25                        vpinsrd        $1, %eax, %xmm1, %xmm2
@@ -1565,7 +1565,7 @@ vzeroupper
 # CHECK-NEXT:  1      1     0.25                        vpslld $1, %xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00                        vpslld %xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  1      8     1.00    *                   vpslld (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vpslldq        $1, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vpslldq        $1, %xmm1, %xmm2
 # CHECK-NEXT:  1      1     0.25                        vpsllq $1, %xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00                        vpsllq %xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  1      8     1.00    *                   vpsllq (%rax), %xmm1, %xmm2
@@ -1581,7 +1581,7 @@ vzeroupper
 # CHECK-NEXT:  1      1     0.25                        vpsrld $1, %xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00                        vpsrld %xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  1      8     1.00    *                   vpsrld (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     1.00                        vpsrldq        $1, %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vpsrldq        $1, %xmm1, %xmm2
 # CHECK-NEXT:  1      1     0.25                        vpsrlq $1, %xmm0, %xmm2
 # CHECK-NEXT:  1      1     1.00                        vpsrlq %xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  1      8     1.00    *                   vpsrlq (%rax), %xmm1, %xmm2
@@ -1632,18 +1632,18 @@ vzeroupper
 # CHECK-NEXT:  3      12    0.50    *                   vrcpps (%rax), %ymm2
 # CHECK-NEXT:  1      5     0.50                        vrcpss %xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  1      12    0.50    *                   vrcpss (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      4     1.00                        vroundpd       $1, %xmm0, %xmm2
-# CHECK-NEXT:  1      11    1.00    *                   vroundpd       $1, (%rax), %xmm2
-# CHECK-NEXT:  1      4     1.00                        vroundpd       $1, %ymm0, %ymm2
-# CHECK-NEXT:  1      11    1.00    *                   vroundpd       $1, (%rax), %ymm2
-# CHECK-NEXT:  1      4     1.00                        vroundps       $1, %xmm0, %xmm2
-# CHECK-NEXT:  1      11    1.00    *                   vroundps       $1, (%rax), %xmm2
-# CHECK-NEXT:  1      4     1.00                        vroundps       $1, %ymm0, %ymm2
-# CHECK-NEXT:  1      11    1.00    *                   vroundps       $1, (%rax), %ymm2
-# CHECK-NEXT:  1      4     1.00                        vroundsd       $1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      11    1.00    *                   vroundsd       $1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      4     1.00                        vroundss       $1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      11    1.00    *                   vroundss       $1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vroundpd       $1, %xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vroundpd       $1, (%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        vroundpd       $1, %ymm0, %ymm2
+# CHECK-NEXT:  1      10    1.00    *                   vroundpd       $1, (%rax), %ymm2
+# CHECK-NEXT:  1      3     1.00                        vroundps       $1, %xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vroundps       $1, (%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        vroundps       $1, %ymm0, %ymm2
+# CHECK-NEXT:  1      10    1.00    *                   vroundps       $1, (%rax), %ymm2
+# CHECK-NEXT:  1      3     1.00                        vroundsd       $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vroundsd       $1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        vroundss       $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vroundss       $1, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      5     0.50                        vrsqrtps       %xmm0, %xmm2
 # CHECK-NEXT:  2      12    0.50    *                   vrsqrtps       (%rax), %xmm2
 # CHECK-NEXT:  2      5     1.00                        vrsqrtps       %ymm0, %ymm2
@@ -1739,7 +1739,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
-# CHECK-NEXT: 112.00 112.00 112.00 0.25   0.25   0.25   0.25    -     191.92 141.92 168.75 455.42  -
+# CHECK-NEXT: 117.00 117.00 117.00 0.25   0.25   0.25   0.25    -     191.92 143.42 170.25 455.42  -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
@@ -1831,7 +1831,7 @@ vzeroupper
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     vcvtdq2pd     (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     vcvtdq2pd     %xmm0, %ymm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     vcvtdq2pd     (%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     vcvtdq2ps     %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     vcvtdq2ps     %xmm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     vcvtdq2ps     (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     vcvtdq2ps     %ymm0, %ymm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     vcvtdq2ps     (%rax), %ymm2
@@ -1843,7 +1843,7 @@ vzeroupper
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     0.50    -      -     0.50    -     vcvtpd2psx    (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     vcvtpd2ps     %ymm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     vcvtpd2psy    (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     vcvtps2dq     %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     vcvtps2dq     %xmm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     vcvtps2dq     (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     vcvtps2dq     %ymm0, %ymm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     vcvtps2dq     (%rax), %ymm2
@@ -1875,7 +1875,7 @@ vzeroupper
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -     0.50   0.50   1.00    -     vcvttpd2dqx   (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     vcvttpd2dq    %ymm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -     0.50   0.50   1.00    -     vcvttpd2dqy   (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     vcvttps2dq    %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     vcvttps2dq    %xmm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     vcvttps2dq    (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     vcvttps2dq    %ymm0, %ymm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     vcvttps2dq    (%rax), %ymm2
@@ -1902,7 +1902,7 @@ vzeroupper
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vdppd $22, %xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vdppd $22, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vdpps $22, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vdpps $22, %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vdpps $22, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     0.33   0.33    -     0.33    -     vextractf128  $1, %ymm0, %xmm2
@@ -1910,21 +1910,21 @@ vzeroupper
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   2.50    -      -     vextractps    $1, %xmm0, %ecx
 # CHECK-NEXT: 1.67   1.67   1.67    -      -      -      -      -      -     0.50   2.50    -      -     vextractps    $1, %xmm0, (%rax)
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhaddpd       %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhaddpd       (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vhaddpd       (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhaddpd       %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhaddpd       (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vhaddpd       (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhaddps       %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhaddps       (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vhaddps       (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhaddps       %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhaddps       (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vhaddps       (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhsubpd       %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhsubpd       (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vhsubpd       (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhsubpd       %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhsubpd       (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vhsubpd       (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhsubps       %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhsubps       (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vhsubps       (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhsubps       %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vhsubps       (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vhsubps       (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     0.33   0.33    -     0.33    -     vinsertf128   $1, %xmm0, %ymm1, %ymm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     0.33   0.33    -     0.33    -     vinsertf128   $1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     vinsertps     $1, %xmm0, %xmm1, %xmm2
@@ -2169,19 +2169,19 @@ vzeroupper
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   2.50    -      -     vpextrw       $1, %xmm0, %ecx
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -     1.00   4.00    -      -     vpextrw       $1, %xmm0, (%rax)
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphaddd       %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphaddd       (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vphaddd       (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphaddsw      %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphaddsw      (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vphaddsw      (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphaddw       %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphaddw       (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vphaddw       (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     vphminposuw   %xmm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     1.00    -      -      -      -     vphminposuw   (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphsubd       %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphsubd       (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vphsubd       (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphsubsw      %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphsubsw      (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vphsubsw      (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphsubw       %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphsubw       (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vphsubw       (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25    -     vpinsrb       $1, %eax, %xmm1, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     0.25   0.25   0.25   0.25    -     vpinsrb       $1, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25    -     vpinsrd       $1, %eax, %xmm1, %xmm2
index fafb657..f958c1f 100644 (file)
@@ -576,18 +576,18 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      100   0.25    *                   vpgatherqd     %xmm0, (%rax,%ymm1,2), %xmm2
 # CHECK-NEXT:  1      100   0.25    *                   vpgatherqq     %xmm0, (%rax,%xmm1,2), %xmm2
 # CHECK-NEXT:  1      100   0.25    *                   vpgatherqq     %ymm0, (%rax,%ymm1,2), %ymm2
-# CHECK-NEXT:  1      100   0.25                        vphaddd        %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25    *                   vphaddd        (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25                        vphaddsw       %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25    *                   vphaddsw       (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25                        vphaddw        %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25    *                   vphaddw        (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25                        vphsubd        %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25    *                   vphsubd        (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25                        vphsubsw       %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25    *                   vphsubsw       (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25                        vphsubw        %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      100   0.25    *                   vphsubw        (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     0.25                        vphaddd        %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     0.33    *                   vphaddd        (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     0.25                        vphaddsw       %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     0.33    *                   vphaddsw       (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     0.25                        vphaddw        %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     0.33    *                   vphaddw        (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     0.25                        vphsubd        %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     0.33    *                   vphsubd        (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     0.25                        vphsubsw       %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     0.33    *                   vphsubsw       (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     0.25                        vphsubw        %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     0.33    *                   vphsubw        (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      4     1.00                        vpmaddubsw     %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  1      11    1.00    *                   vpmaddubsw     (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      4     1.00                        vpmaddwd       %ymm0, %ymm1, %ymm2
@@ -625,29 +625,29 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.25                        vpminuw        %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  1      8     0.33    *                   vpminuw        (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  2      2     2.00                        vpmovmskb      %ymm0, %ecx
-# CHECK-NEXT:  2      1     0.50                        vpmovsxbd      %xmm0, %ymm2
+# CHECK-NEXT:  2      4     0.50                        vpmovsxbd      %xmm0, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vpmovsxbd      (%rax), %ymm2
-# CHECK-NEXT:  2      1     0.50                        vpmovsxbq      %xmm0, %ymm2
+# CHECK-NEXT:  2      4     0.50                        vpmovsxbq      %xmm0, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vpmovsxbq      (%rax), %ymm2
-# CHECK-NEXT:  2      1     0.50                        vpmovsxbw      %xmm0, %ymm2
+# CHECK-NEXT:  2      4     0.50                        vpmovsxbw      %xmm0, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vpmovsxbw      (%rax), %ymm2
-# CHECK-NEXT:  2      1     0.50                        vpmovsxdq      %xmm0, %ymm2
+# CHECK-NEXT:  2      4     0.50                        vpmovsxdq      %xmm0, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vpmovsxdq      (%rax), %ymm2
-# CHECK-NEXT:  2      1     0.50                        vpmovsxwd      %xmm0, %ymm2
+# CHECK-NEXT:  2      4     0.50                        vpmovsxwd      %xmm0, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vpmovsxwd      (%rax), %ymm2
-# CHECK-NEXT:  2      1     0.50                        vpmovsxwq      %xmm0, %ymm2
+# CHECK-NEXT:  2      4     0.50                        vpmovsxwq      %xmm0, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vpmovsxwq      (%rax), %ymm2
-# CHECK-NEXT:  2      1     0.50                        vpmovzxbd      %xmm0, %ymm2
+# CHECK-NEXT:  2      4     0.50                        vpmovzxbd      %xmm0, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vpmovzxbd      (%rax), %ymm2
-# CHECK-NEXT:  2      1     0.50                        vpmovzxbq      %xmm0, %ymm2
+# CHECK-NEXT:  2      4     0.50                        vpmovzxbq      %xmm0, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vpmovzxbq      (%rax), %ymm2
-# CHECK-NEXT:  2      1     0.50                        vpmovzxbw      %xmm0, %ymm2
+# CHECK-NEXT:  2      4     0.50                        vpmovzxbw      %xmm0, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vpmovzxbw      (%rax), %ymm2
-# CHECK-NEXT:  2      1     0.50                        vpmovzxdq      %xmm0, %ymm2
+# CHECK-NEXT:  2      4     0.50                        vpmovzxdq      %xmm0, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vpmovzxdq      (%rax), %ymm2
-# CHECK-NEXT:  2      1     0.50                        vpmovzxwd      %xmm0, %ymm2
+# CHECK-NEXT:  2      4     0.50                        vpmovzxwd      %xmm0, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vpmovzxwd      (%rax), %ymm2
-# CHECK-NEXT:  2      1     0.50                        vpmovzxwq      %xmm0, %ymm2
+# CHECK-NEXT:  2      4     0.50                        vpmovzxwq      %xmm0, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vpmovzxwq      (%rax), %ymm2
 # CHECK-NEXT:  1      4     1.00                        vpmuldq        %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  1      11    1.00    *                   vpmuldq        (%rax), %ymm1, %ymm2
@@ -657,8 +657,8 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      11    1.00    *                   vpmulhuw       (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      4     1.00                        vpmulhw        %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  1      11    1.00    *                   vpmulhw        (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vpmulld        %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      10    1.00    *                   vpmulld        (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      4     1.00                        vpmulld        %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      11    1.00    *                   vpmulld        (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      4     1.00                        vpmullw        %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  1      11    1.00    *                   vpmullw        (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      4     1.00                        vpmuludq       %ymm0, %ymm1, %ymm2
@@ -682,51 +682,51 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.25                        vpsignw        %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  1      8     0.33    *                   vpsignw        (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.25                        vpslld $1, %ymm0, %ymm2
-# CHECK-NEXT:  1      2     1.00                        vpslld %xmm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      9     1.00    *                   vpslld (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vpslldq        $1, %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vpslld %xmm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      8     1.00    *                   vpslld (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     1.00                        vpslldq        $1, %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.25                        vpsllq $1, %ymm0, %ymm2
-# CHECK-NEXT:  1      2     1.00                        vpsllq %xmm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      9     1.00    *                   vpsllq (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     0.50                        vpsllvd        %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1          0.50    *                   vpsllvd        (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsllvd        %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1          0.50    *                   vpsllvd        (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     0.50                        vpsllvq        %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1          0.50    *                   vpsllvq        (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsllvq        %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1          0.50    *                   vpsllvq        (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vpsllq %xmm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      8     1.00    *                   vpsllq (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     0.50                        vpsllvd        %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vpsllvd        (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsllvd        %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vpsllvd        (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     0.50                        vpsllvq        %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vpsllvq        (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsllvq        %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vpsllvq        (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.25                        vpsllw $1, %ymm0, %ymm2
-# CHECK-NEXT:  1      2     1.00                        vpsllw %xmm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      9     1.00    *                   vpsllw (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vpsllw %xmm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      8     1.00    *                   vpsllw (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.25                        vpsrad $1, %ymm0, %ymm2
-# CHECK-NEXT:  1      2     1.00                        vpsrad %xmm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      9     1.00    *                   vpsrad (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     0.50                        vpsravd        %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1          0.50    *                   vpsravd        (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsravd        %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1          0.50    *                   vpsravd        (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vpsrad %xmm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      8     1.00    *                   vpsrad (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     0.50                        vpsravd        %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vpsravd        (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsravd        %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vpsravd        (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.25                        vpsraw $1, %ymm0, %ymm2
-# CHECK-NEXT:  1      2     1.00                        vpsraw %xmm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      9     1.00    *                   vpsraw (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vpsraw %xmm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      8     1.00    *                   vpsraw (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.25                        vpsrld $1, %ymm0, %ymm2
-# CHECK-NEXT:  1      2     1.00                        vpsrld %xmm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      9     1.00    *                   vpsrld (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     1.00                        vpsrldq        $1, %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vpsrld %xmm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      8     1.00    *                   vpsrld (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     1.00                        vpsrldq        $1, %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.25                        vpsrlq $1, %ymm0, %ymm2
-# CHECK-NEXT:  1      2     1.00                        vpsrlq %xmm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      9     1.00    *                   vpsrlq (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     0.50                        vpsrlvd        %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1          0.50    *                   vpsrlvd        (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsrlvd        %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1          0.50    *                   vpsrlvd        (%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      1     0.50                        vpsrlvq        %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1          0.50    *                   vpsrlvq        (%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vpsrlvq        %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  1          0.50    *                   vpsrlvq        (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vpsrlq %xmm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      8     1.00    *                   vpsrlq (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     0.50                        vpsrlvd        %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vpsrlvd        (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsrlvd        %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vpsrlvd        (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      3     0.50                        vpsrlvq        %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vpsrlvq        (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsrlvq        %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      10    0.50    *                   vpsrlvq        (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.25                        vpsrlw $1, %ymm0, %ymm2
-# CHECK-NEXT:  1      2     1.00                        vpsrlw %xmm0, %ymm1, %ymm2
-# CHECK-NEXT:  1      9     1.00    *                   vpsrlw (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      1     1.00                        vpsrlw %xmm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      8     1.00    *                   vpsrlw (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.25                        vpsubb %ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  1      8     0.33    *                   vpsubb (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.25                        vpsubd %ymm0, %ymm1, %ymm2
@@ -779,7 +779,7 @@ vpxor           (%rax), %ymm1, %ymm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
-# CHECK-NEXT: 42.67  42.67  42.67   -      -      -      -      -     70.17  75.17  85.00  42.67   -
+# CHECK-NEXT: 44.67  44.67  44.67   -      -      -      -      -     70.17  75.17  85.00  42.67   -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
@@ -900,17 +900,17 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vpgatherqq    %xmm0, (%rax,%xmm1,2), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vpgatherqq    %ymm0, (%rax,%ymm1,2), %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphaddd       %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphaddd       (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vphaddd       (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphaddsw      %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphaddsw      (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vphaddsw      (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphaddw       %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphaddw       (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vphaddw       (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphsubd       %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphsubd       (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vphsubd       (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphsubsw      %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphsubsw      (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vphsubsw      (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphsubw       %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     vphsubw       (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     vphsubw       (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     vpmaddubsw    %ymm0, %ymm1, %ymm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     1.00    -      -      -      -     vpmaddubsw    (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     vpmaddwd      %ymm0, %ymm1, %ymm2
index 496176e..d25b1a7 100644 (file)
@@ -202,10 +202,10 @@ xorps       (%rax), %xmm2
 # CHECK-NEXT:  1      8     0.33    *                   andnps (%rax), %xmm2
 # CHECK-NEXT:  1      1     0.25                        andps  %xmm0, %xmm2
 # CHECK-NEXT:  1      8     0.33    *                   andps  (%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        cmpeqps        %xmm0, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   cmpeqps        (%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        cmpeqss        %xmm0, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   cmpeqss        (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        cmpeqps        %xmm0, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   cmpeqps        (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        cmpeqss        %xmm0, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   cmpeqss        (%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        comiss %xmm0, %xmm1
 # CHECK-NEXT:  1      10    1.00    *                   comiss (%rax), %xmm1
 # CHECK-NEXT:  1      5     1.00                        cvtpi2ps       %mm0, %xmm2
@@ -232,14 +232,14 @@ xorps       (%rax), %xmm2
 # CHECK-NEXT:  1      22    1.00    *                   divss  (%rax), %xmm2
 # CHECK-NEXT:  1      100   0.25    *             U     ldmxcsr        (%rax)
 # CHECK-NEXT:  1      100   0.25    *      *      U     maskmovq       %mm0, %mm1
-# CHECK-NEXT:  1      3     1.00                        maxps  %xmm0, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   maxps  (%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        maxss  %xmm0, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   maxss  (%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        minps  %xmm0, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   minps  (%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        minss  %xmm0, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   minss  (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        maxps  %xmm0, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   maxps  (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        maxss  %xmm0, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   maxss  (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        minps  %xmm0, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   minps  (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        minss  %xmm0, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   minss  (%rax), %xmm2
 # CHECK-NEXT:  1      1     0.25                        movaps %xmm0, %xmm2
 # CHECK-NEXT:  1      1     0.33           *            movaps %xmm0, (%rax)
 # CHECK-NEXT:  1      8     0.33    *                   movaps (%rax), %xmm2
index c369915..8dc689b 100644 (file)
@@ -416,15 +416,15 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT:  1      1     0.25                        andpd  %xmm0, %xmm2
 # CHECK-NEXT:  1      8     0.33    *                   andpd  (%rax), %xmm2
 # CHECK-NEXT:  1      8     0.33    *      *      U     clflush        (%rax)
-# CHECK-NEXT:  1      3     1.00                        cmpeqpd        %xmm0, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   cmpeqpd        (%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        cmpeqsd        %xmm0, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   cmpeqsd        (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        cmpeqpd        %xmm0, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   cmpeqpd        (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        cmpeqsd        %xmm0, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   cmpeqsd        (%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        comisd %xmm0, %xmm1
 # CHECK-NEXT:  1      10    1.00    *                   comisd (%rax), %xmm1
 # CHECK-NEXT:  1      3     1.00                        cvtdq2pd       %xmm0, %xmm2
 # CHECK-NEXT:  1      12    1.00    *                   cvtdq2pd       (%rax), %xmm2
-# CHECK-NEXT:  1      5     1.00                        cvtdq2ps       %xmm0, %xmm2
+# CHECK-NEXT:  1      3     1.00                        cvtdq2ps       %xmm0, %xmm2
 # CHECK-NEXT:  1      12    1.00    *                   cvtdq2ps       (%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        cvtpd2dq       %xmm0, %xmm2
 # CHECK-NEXT:  2      10    1.00    *                   cvtpd2dq       (%rax), %xmm2
@@ -434,7 +434,7 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT:  2      10    0.50    *                   cvtpd2ps       (%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        cvtpi2pd       %mm0, %xmm2
 # CHECK-NEXT:  1      12    1.00    *                   cvtpi2pd       (%rax), %xmm2
-# CHECK-NEXT:  1      5     1.00                        cvtps2dq       %xmm0, %xmm2
+# CHECK-NEXT:  1      3     1.00                        cvtps2dq       %xmm0, %xmm2
 # CHECK-NEXT:  1      12    1.00    *                   cvtps2dq       (%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        cvtps2pd       %xmm0, %xmm2
 # CHECK-NEXT:  2      10    1.00    *                   cvtps2pd       (%rax), %xmm2
@@ -444,8 +444,8 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT:  1      11    1.00    *                   cvtsd2si       (%rax), %rcx
 # CHECK-NEXT:  1      3     1.00                        cvtsd2ss       %xmm0, %xmm2
 # CHECK-NEXT:  2      10    0.50    *                   cvtsd2ss       (%rax), %xmm2
-# CHECK-NEXT:  1      4     1.00                        cvtsi2sd       %ecx, %xmm2
-# CHECK-NEXT:  1      4     1.00                        cvtsi2sd       %rcx, %xmm2
+# CHECK-NEXT:  1      3     1.00                        cvtsi2sd       %ecx, %xmm2
+# CHECK-NEXT:  1      3     1.00                        cvtsi2sd       %rcx, %xmm2
 # CHECK-NEXT:  1      12    1.00    *                   cvtsi2sdl      (%rax), %xmm2
 # CHECK-NEXT:  1      12    1.00    *                   cvtsi2sdl      (%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        cvtss2sd       %xmm0, %xmm2
@@ -454,7 +454,7 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT:  2      10    1.00    *                   cvttpd2dq      (%rax), %xmm2
 # CHECK-NEXT:  1      4     1.00                        cvttpd2pi      %xmm0, %mm2
 # CHECK-NEXT:  1      12    1.00    *                   cvttpd2pi      (%rax), %mm2
-# CHECK-NEXT:  1      5     1.00                        cvttps2dq      %xmm0, %xmm2
+# CHECK-NEXT:  1      3     1.00                        cvttps2dq      %xmm0, %xmm2
 # CHECK-NEXT:  1      12    1.00    *                   cvttps2dq      (%rax), %xmm2
 # CHECK-NEXT:  1      4     1.00                        cvttsd2si      %xmm0, %ecx
 # CHECK-NEXT:  1      4     1.00                        cvttsd2si      %xmm0, %rcx
@@ -466,15 +466,15 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT:  1      22    1.00    *                   divsd  (%rax), %xmm2
 # CHECK-NEXT:  1      1     0.33    *      *      U     lfence
 # CHECK-NEXT:  1      100   0.25    *      *      U     maskmovdqu     %xmm0, %xmm1
-# CHECK-NEXT:  1      3     1.00                        maxpd  %xmm0, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   maxpd  (%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        maxsd  %xmm0, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   maxsd  (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        maxpd  %xmm0, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   maxpd  (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        maxsd  %xmm0, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   maxsd  (%rax), %xmm2
 # CHECK-NEXT:  1      1     0.33    *      *      U     mfence
-# CHECK-NEXT:  1      3     1.00                        minpd  %xmm0, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   minpd  (%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        minsd  %xmm0, %xmm2
-# CHECK-NEXT:  1      10    1.00    *                   minsd  (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        minpd  %xmm0, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   minpd  (%rax), %xmm2
+# CHECK-NEXT:  1      1     1.00                        minsd  %xmm0, %xmm2
+# CHECK-NEXT:  1      8     1.00    *                   minsd  (%rax), %xmm2
 # CHECK-NEXT:  1      1     0.25                        movapd %xmm0, %xmm2
 # CHECK-NEXT:  1      1     0.33           *            movapd %xmm0, (%rax)
 # CHECK-NEXT:  1      8     0.33    *                   movapd (%rax), %xmm2
@@ -597,7 +597,7 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT:  1      1     0.25                        pslld  $1, %xmm2
 # CHECK-NEXT:  1      1     1.00                        pslld  %xmm0, %xmm2
 # CHECK-NEXT:  1      8     1.00    *                   pslld  (%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        pslldq $1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        pslldq $1, %xmm2
 # CHECK-NEXT:  1      1     0.25                        psllq  $1, %xmm2
 # CHECK-NEXT:  1      1     1.00                        psllq  %xmm0, %xmm2
 # CHECK-NEXT:  1      8     1.00    *                   psllq  (%rax), %xmm2
@@ -613,7 +613,7 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT:  1      1     0.25                        psrld  $1, %xmm2
 # CHECK-NEXT:  1      1     1.00                        psrld  %xmm0, %xmm2
 # CHECK-NEXT:  1      8     1.00    *                   psrld  (%rax), %xmm2
-# CHECK-NEXT:  1      1     1.00                        psrldq $1, %xmm2
+# CHECK-NEXT:  1      3     1.00                        psrldq $1, %xmm2
 # CHECK-NEXT:  1      1     0.25                        psrlq  $1, %xmm2
 # CHECK-NEXT:  1      1     1.00                        psrlq  %xmm0, %xmm2
 # CHECK-NEXT:  1      8     1.00    *                   psrlq  (%rax), %xmm2
@@ -692,7 +692,7 @@ xorpd       (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
-# CHECK-NEXT: 44.33  44.33  44.33   -      -      -      -      -     71.92  40.42  71.75  152.92  -
+# CHECK-NEXT: 44.33  44.33  44.33   -      -      -      -      -     71.92  41.92  73.25  152.92  -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
@@ -713,7 +713,7 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     1.00    -      -      -      -     comisd        (%rax), %xmm1
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     cvtdq2pd      %xmm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     cvtdq2pd      (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     cvtdq2ps      %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     cvtdq2ps      %xmm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     cvtdq2ps      (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     cvtpd2dq      %xmm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -     0.50   0.50   1.00    -     cvtpd2dq      (%rax), %xmm2
@@ -723,7 +723,7 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     0.50    -      -     0.50    -     cvtpd2ps      (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     cvtpi2pd      %mm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     cvtpi2pd      (%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     cvtps2dq      %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     cvtps2dq      %xmm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     cvtps2dq      (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     cvtps2pd      %xmm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     cvtps2pd      (%rax), %xmm2
@@ -743,7 +743,7 @@ xorpd       (%rax), %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -     0.50   0.50   1.00    -     cvttpd2dq     (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     cvttpd2pi     %xmm0, %mm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     cvttpd2pi     (%rax), %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     cvttps2dq     %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50   1.00    -     cvttps2dq     %xmm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -     1.00    -     cvttps2dq     (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00   1.00    -     cvttsd2si     %xmm0, %ecx
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00   1.00    -     cvttsd2si     %xmm0, %rcx
index aefd015..5419f06 100644 (file)
@@ -47,14 +47,14 @@ mwait
 # CHECK-NEXT:  1      10    1.00    *                   addsubpd       (%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        addsubps       %xmm0, %xmm2
 # CHECK-NEXT:  1      10    1.00    *                   addsubps       (%rax), %xmm2
-# CHECK-NEXT:  1      100   0.25                        haddpd %xmm0, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   haddpd (%rax), %xmm2
-# CHECK-NEXT:  1      100   0.25                        haddps %xmm0, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   haddps (%rax), %xmm2
-# CHECK-NEXT:  1      100   0.25                        hsubpd %xmm0, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   hsubpd (%rax), %xmm2
-# CHECK-NEXT:  1      100   0.25                        hsubps %xmm0, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   hsubps (%rax), %xmm2
+# CHECK-NEXT:  1      7     0.25                        haddpd %xmm0, %xmm2
+# CHECK-NEXT:  2      11    0.33    *                   haddpd (%rax), %xmm2
+# CHECK-NEXT:  1      7     0.25                        haddps %xmm0, %xmm2
+# CHECK-NEXT:  2      11    0.33    *                   haddps (%rax), %xmm2
+# CHECK-NEXT:  1      7     0.25                        hsubpd %xmm0, %xmm2
+# CHECK-NEXT:  2      11    0.33    *                   hsubpd (%rax), %xmm2
+# CHECK-NEXT:  1      7     0.25                        hsubps %xmm0, %xmm2
+# CHECK-NEXT:  2      11    0.33    *                   hsubps (%rax), %xmm2
 # CHECK-NEXT:  1      8     0.33    *                   lddqu  (%rax), %xmm2
 # CHECK-NEXT:  1      100   0.25                  U     monitor
 # CHECK-NEXT:  1      1     0.50                        movddup        %xmm0, %xmm2
@@ -82,7 +82,7 @@ mwait
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
-# CHECK-NEXT: 1.67   1.67   1.67    -      -      -      -      -     4.00   2.00   2.00    -      -
+# CHECK-NEXT: 3.00   3.00   3.00    -      -      -      -      -     4.00   2.00   2.00    -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
@@ -91,13 +91,13 @@ mwait
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     addsubps      %xmm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     1.00    -      -      -      -     addsubps      (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     haddpd        %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     haddpd        (%rax), %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     haddpd        (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     haddps        %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     haddps        (%rax), %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     haddps        (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     hsubpd        %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     hsubpd        (%rax), %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     hsubpd        (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     hsubps        %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     hsubps        (%rax), %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     hsubps        (%rax), %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     lddqu (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     monitor
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     movddup       %xmm0, %xmm2
index 9005ec3..fb26a43 100644 (file)
@@ -165,8 +165,8 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  1      8     0.50    *                   blendvps       %xmm0, (%rax), %xmm2
 # CHECK-NEXT:  1      100   0.25                        dppd   $22, %xmm0, %xmm2
 # CHECK-NEXT:  1      100   0.25    *                   dppd   $22, (%rax), %xmm2
-# CHECK-NEXT:  1      100   0.25                        dpps   $22, %xmm0, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   dpps   $22, (%rax), %xmm2
+# CHECK-NEXT:  1      15    0.25                        dpps   $22, %xmm0, %xmm2
+# CHECK-NEXT:  2      19    0.33    *                   dpps   $22, (%rax), %xmm2
 # CHECK-NEXT:  1      2     2.00                        extractps      $1, %xmm0, %ecx
 # CHECK-NEXT:  2      5     2.00           *            extractps      $1, %xmm0, (%rax)
 # CHECK-NEXT:  1      1     0.50                        insertps       $1, %xmm0, %xmm2
@@ -243,14 +243,14 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  2      11    1.00    *                   pmulld (%rax), %xmm2
 # CHECK-NEXT:  1      1     1.00                        ptest  %xmm0, %xmm1
 # CHECK-NEXT:  2      8     1.00    *                   ptest  (%rax), %xmm1
-# CHECK-NEXT:  1      4     1.00                        roundpd        $1, %xmm0, %xmm2
-# CHECK-NEXT:  1      11    1.00    *                   roundpd        $1, (%rax), %xmm2
-# CHECK-NEXT:  1      4     1.00                        roundps        $1, %xmm0, %xmm2
-# CHECK-NEXT:  1      11    1.00    *                   roundps        $1, (%rax), %xmm2
-# CHECK-NEXT:  1      4     1.00                        roundsd        $1, %xmm0, %xmm2
-# CHECK-NEXT:  1      11    1.00    *                   roundsd        $1, (%rax), %xmm2
-# CHECK-NEXT:  1      4     1.00                        roundss        $1, %xmm0, %xmm2
-# CHECK-NEXT:  1      11    1.00    *                   roundss        $1, (%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        roundpd        $1, %xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   roundpd        $1, (%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        roundps        $1, %xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   roundps        $1, (%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        roundsd        $1, %xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   roundsd        $1, (%rax), %xmm2
+# CHECK-NEXT:  1      3     1.00                        roundss        $1, %xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   roundss        $1, (%rax), %xmm2
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - Zn2AGU0
@@ -269,7 +269,7 @@ roundss     $1, (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
-# CHECK-NEXT: 16.67  16.67  16.67   -      -      -      -      -     25.17  26.67  44.00  21.17   -
+# CHECK-NEXT: 17.00  17.00  17.00   -      -      -      -      -     25.17  26.67  44.00  21.17   -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
@@ -284,7 +284,7 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     dppd  $22, %xmm0, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     dppd  $22, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     dpps  $22, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     dpps  $22, (%rax), %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     dpps  $22, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   2.50    -      -     extractps     $1, %xmm0, %ecx
 # CHECK-NEXT: 1.67   1.67   1.67    -      -      -      -      -      -     0.50   2.50    -      -     extractps     $1, %xmm0, (%rax)
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     insertps      $1, %xmm0, %xmm2
index 332aaaf..91c2133 100644 (file)
@@ -19,8 +19,8 @@ movntss     %xmm0, (%rax)
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      2     1.00                        extrq  %xmm0, %xmm2
-# CHECK-NEXT:  1      2     1.00                        extrq  $22, $2, %xmm2
+# CHECK-NEXT:  1      3     1.00                        extrq  %xmm0, %xmm2
+# CHECK-NEXT:  1      3     1.00                        extrq  $22, $2, %xmm2
 # CHECK-NEXT:  1      4     1.00                        insertq        %xmm0, %xmm2
 # CHECK-NEXT:  1      4     1.00                        insertq        $22, $22, %xmm0, %xmm2
 # CHECK-NEXT:  1      8     1.00           *            movntsd        %xmm0, (%rax)
index 8b60106..cdcc493 100644 (file)
@@ -122,30 +122,30 @@ psignw      (%rax), %xmm2
 # CHECK-NEXT:  1      8     0.33    *                   palignr        $1, (%rax), %mm2
 # CHECK-NEXT:  1      1     0.25                        palignr        $1, %xmm0, %xmm2
 # CHECK-NEXT:  1      8     0.33    *                   palignr        $1, (%rax), %xmm2
-# CHECK-NEXT:  1      100   0.25                        phaddd %mm0, %mm2
-# CHECK-NEXT:  1      100   0.25    *                   phaddd (%rax), %mm2
-# CHECK-NEXT:  1      100   0.25                        phaddd %xmm0, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   phaddd (%rax), %xmm2
-# CHECK-NEXT:  1      100   0.25                        phaddsw        %mm0, %mm2
-# CHECK-NEXT:  1      100   0.25    *                   phaddsw        (%rax), %mm2
-# CHECK-NEXT:  1      100   0.25                        phaddsw        %xmm0, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   phaddsw        (%rax), %xmm2
-# CHECK-NEXT:  1      100   0.25                        phaddw %mm0, %mm2
-# CHECK-NEXT:  1      100   0.25    *                   phaddw (%rax), %mm2
-# CHECK-NEXT:  1      100   0.25                        phaddw %xmm0, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   phaddw (%rax), %xmm2
-# CHECK-NEXT:  1      100   0.25                        phsubd %mm0, %mm2
-# CHECK-NEXT:  1      100   0.25    *                   phsubd (%rax), %mm2
-# CHECK-NEXT:  1      100   0.25                        phsubd %xmm0, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   phsubd (%rax), %xmm2
-# CHECK-NEXT:  1      100   0.25                        phsubsw        %mm0, %mm2
-# CHECK-NEXT:  1      100   0.25    *                   phsubsw        (%rax), %mm2
-# CHECK-NEXT:  1      100   0.25                        phsubsw        %xmm0, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   phsubsw        (%rax), %xmm2
-# CHECK-NEXT:  1      100   0.25                        phsubw %mm0, %mm2
-# CHECK-NEXT:  1      100   0.25    *                   phsubw (%rax), %mm2
-# CHECK-NEXT:  1      100   0.25                        phsubw %xmm0, %xmm2
-# CHECK-NEXT:  1      100   0.25    *                   phsubw (%rax), %xmm2
+# CHECK-NEXT:  1      3     0.25                        phaddd %mm0, %mm2
+# CHECK-NEXT:  2      7     0.33    *                   phaddd (%rax), %mm2
+# CHECK-NEXT:  1      3     0.25                        phaddd %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.33    *                   phaddd (%rax), %xmm2
+# CHECK-NEXT:  1      3     0.25                        phaddsw        %mm0, %mm2
+# CHECK-NEXT:  2      7     0.33    *                   phaddsw        (%rax), %mm2
+# CHECK-NEXT:  1      3     0.25                        phaddsw        %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.33    *                   phaddsw        (%rax), %xmm2
+# CHECK-NEXT:  1      3     0.25                        phaddw %mm0, %mm2
+# CHECK-NEXT:  2      7     0.33    *                   phaddw (%rax), %mm2
+# CHECK-NEXT:  1      3     0.25                        phaddw %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.33    *                   phaddw (%rax), %xmm2
+# CHECK-NEXT:  1      3     0.25                        phsubd %mm0, %mm2
+# CHECK-NEXT:  2      7     0.33    *                   phsubd (%rax), %mm2
+# CHECK-NEXT:  1      3     0.25                        phsubd %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.33    *                   phsubd (%rax), %xmm2
+# CHECK-NEXT:  1      3     0.25                        phsubsw        %mm0, %mm2
+# CHECK-NEXT:  2      7     0.33    *                   phsubsw        (%rax), %mm2
+# CHECK-NEXT:  1      3     0.25                        phsubsw        %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.33    *                   phsubsw        (%rax), %xmm2
+# CHECK-NEXT:  1      3     0.25                        phsubw %mm0, %mm2
+# CHECK-NEXT:  2      7     0.33    *                   phsubw (%rax), %mm2
+# CHECK-NEXT:  1      3     0.25                        phsubw %xmm0, %xmm2
+# CHECK-NEXT:  2      7     0.33    *                   phsubw (%rax), %xmm2
 # CHECK-NEXT:  1      4     1.00                        pmaddubsw      %mm0, %mm2
 # CHECK-NEXT:  1      11    1.00    *                   pmaddubsw      (%rax), %mm2
 # CHECK-NEXT:  1      4     1.00                        pmaddubsw      %xmm0, %xmm2
@@ -188,7 +188,7 @@ psignw      (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
-# CHECK-NEXT: 6.67   6.67   6.67    -      -      -      -      -     16.00  8.00   8.00   8.00    -
+# CHECK-NEXT: 10.67  10.67  10.67   -      -      -      -      -     16.00  8.00   8.00   8.00    -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
@@ -209,29 +209,29 @@ psignw      (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25    -     palignr       $1, %xmm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     0.25   0.25   0.25   0.25    -     palignr       $1, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phaddd        %mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phaddd        (%rax), %mm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     phaddd        (%rax), %mm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phaddd        %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phaddd        (%rax), %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     phaddd        (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phaddsw       %mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phaddsw       (%rax), %mm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     phaddsw       (%rax), %mm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phaddsw       %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phaddsw       (%rax), %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     phaddsw       (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phaddw        %mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phaddw        (%rax), %mm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     phaddw        (%rax), %mm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phaddw        %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phaddw        (%rax), %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     phaddw        (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phsubd        %mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phsubd        (%rax), %mm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     phsubd        (%rax), %mm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phsubd        %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phsubd        (%rax), %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     phsubd        (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phsubsw       %mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phsubsw       (%rax), %mm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     phsubsw       (%rax), %mm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phsubsw       %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phsubsw       (%rax), %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     phsubsw       (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phsubw        %mm0, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phsubw        (%rax), %mm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     phsubw        (%rax), %mm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phsubw        %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     phsubw        (%rax), %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -     phsubw        (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     pmaddubsw     %mm0, %mm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     1.00    -      -      -      -     pmaddubsw     (%rax), %mm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     pmaddubsw     %xmm0, %xmm2
index 355f3bc..8849e32 100644 (file)
@@ -1002,17 +1002,17 @@ xorq (%rax), %rdi
 # CHECK-NEXT:  2      5     0.33    *      *            andq   %rsi, (%rax)
 # CHECK-NEXT:  2      5     0.33    *                   andq   (%rax), %rdi
 # CHECK-NEXT:  1      3     0.25                        bsfw   %si, %di
-# CHECK-NEXT:  1      3     0.25                        bsrw   %si, %di
+# CHECK-NEXT:  1      4     0.25                        bsrw   %si, %di
 # CHECK-NEXT:  2      7     0.33    *                   bsfw   (%rax), %di
-# CHECK-NEXT:  2      7     0.33    *                   bsrw   (%rax), %di
+# CHECK-NEXT:  2      8     0.33    *                   bsrw   (%rax), %di
 # CHECK-NEXT:  1      3     0.25                        bsfl   %esi, %edi
-# CHECK-NEXT:  1      3     0.25                        bsrl   %esi, %edi
+# CHECK-NEXT:  1      4     0.25                        bsrl   %esi, %edi
 # CHECK-NEXT:  2      7     0.33    *                   bsfl   (%rax), %edi
-# CHECK-NEXT:  2      7     0.33    *                   bsrl   (%rax), %edi
+# CHECK-NEXT:  2      8     0.33    *                   bsrl   (%rax), %edi
 # CHECK-NEXT:  1      3     0.25                        bsfq   %rsi, %rdi
-# CHECK-NEXT:  1      3     0.25                        bsrq   %rsi, %rdi
+# CHECK-NEXT:  1      4     0.25                        bsrq   %rsi, %rdi
 # CHECK-NEXT:  2      7     0.33    *                   bsfq   (%rax), %rdi
-# CHECK-NEXT:  2      7     0.33    *                   bsrq   (%rax), %rdi
+# CHECK-NEXT:  2      8     0.33    *                   bsrq   (%rax), %rdi
 # CHECK-NEXT:  1      1     1.00                        bswapl %eax
 # CHECK-NEXT:  1      1     1.00                        bswapq %rax
 # CHECK-NEXT:  1      1     0.25                        btw    %si, %di
@@ -1106,13 +1106,13 @@ xorq (%rax), %rdi
 # CHECK-NEXT:  1      100   0.25                  U     cmpsw  %es:(%rdi), (%rsi)
 # CHECK-NEXT:  1      100   0.25                  U     cmpsl  %es:(%rdi), (%rsi)
 # CHECK-NEXT:  1      100   0.25                  U     cmpsq  %es:(%rdi), (%rsi)
-# CHECK-NEXT:  1      1     0.25                        cmpxchgb       %cl, %bl
+# CHECK-NEXT:  1      3     0.25                        cmpxchgb       %cl, %bl
 # CHECK-NEXT:  5      8     0.33    *      *            cmpxchgb       %cl, (%rbx)
-# CHECK-NEXT:  1      1     0.25                        cmpxchgw       %cx, %bx
+# CHECK-NEXT:  1      3     0.25                        cmpxchgw       %cx, %bx
 # CHECK-NEXT:  5      8     0.33    *      *            cmpxchgw       %cx, (%rbx)
-# CHECK-NEXT:  1      1     0.25                        cmpxchgl       %ecx, %ebx
+# CHECK-NEXT:  1      3     0.25                        cmpxchgl       %ecx, %ebx
 # CHECK-NEXT:  5      8     0.33    *      *            cmpxchgl       %ecx, (%rbx)
-# CHECK-NEXT:  1      1     0.25                        cmpxchgq       %rcx, %rbx
+# CHECK-NEXT:  1      3     0.25                        cmpxchgq       %rcx, %rbx
 # CHECK-NEXT:  5      8     0.33    *      *            cmpxchgq       %rcx, (%rbx)
 # CHECK-NEXT:  1      100   0.25                  U     cpuid
 # CHECK-NEXT:  1      1     0.25                        decb   %dil
@@ -1146,9 +1146,9 @@ xorq (%rax), %rdi
 # CHECK-NEXT:  1      7     1.00    *                   imulw  (%rax)
 # CHECK-NEXT:  1      3     1.00                        imulw  %si, %di
 # CHECK-NEXT:  1      7     1.00    *                   imulw  (%rax), %di
-# CHECK-NEXT:  1      3     1.00                        imulw  $511, %si, %di
+# CHECK-NEXT:  1      4     1.00                        imulw  $511, %si, %di
 # CHECK-NEXT:  1      7     1.00    *                   imulw  $511, (%rax), %di
-# CHECK-NEXT:  1      3     1.00                        imulw  $7, %si, %di
+# CHECK-NEXT:  1      4     1.00                        imulw  $7, %si, %di
 # CHECK-NEXT:  1      7     1.00    *                   imulw  $7, (%rax), %di
 # CHECK-NEXT:  1      3     1.00                        imull  %edi
 # CHECK-NEXT:  1      7     1.00    *                   imull  (%rax)