D138359 reported that the EXTRACTPSrr override was unnecessary; however, the AMD Software Optimization Guide (SOG) and Agner Fog's instruction tables both confirm that the rr and rm versions each take 2 uops (matching znver1).
// EXTRACTPS r32,x,i.
// Scheduling write for the register-destination (rr) form of (V)EXTRACTPS.
def Zn2WriteEXTRACTPSr : SchedWriteRes<[Zn2FPU12, Zn2FPU2]> {
let Latency = 2;
// The rr form is modelled as 2 micro-ops, like the rm form (per the AMD SOG
// and Agner Fog's tables, matching znver1).
+ let NumMicroOps = 2;
// Entries pair up positionally with the resources listed above:
// Zn2FPU12 for 1 cycle, Zn2FPU2 for 2 cycles.
let ResourceCycles = [1, 2];
}
// Bind the write to EXTRACTPSrr and VEXTRACTPSrr (the regex's optional "V").
def : InstRW<[Zn2WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>;
# CHECK-NEXT: 1 100 0.25 * vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 1 0.33 vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: 2 8 0.33 * vextractf128 $1, %ymm0, (%rax)
-# CHECK-NEXT: 1 2 2.00 vextractps $1, %xmm0, %ecx
+# CHECK-NEXT: 2 2 2.00 vextractps $1, %xmm0, %ecx
# CHECK-NEXT: 2 5 2.00 * vextractps $1, %xmm0, (%rax)
# CHECK-NEXT: 1 7 0.25 vhaddpd %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 14 0.33 * vhaddpd (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 100 0.25 * dppd $22, (%rax), %xmm2
# CHECK-NEXT: 1 15 0.25 dpps $22, %xmm0, %xmm2
# CHECK-NEXT: 2 19 0.33 * dpps $22, (%rax), %xmm2
-# CHECK-NEXT: 1 2 2.00 extractps $1, %xmm0, %ecx
+# CHECK-NEXT: 2 2 2.00 extractps $1, %xmm0, %ecx
# CHECK-NEXT: 2 5 2.00 * extractps $1, %xmm0, (%rax)
# CHECK-NEXT: 1 1 0.50 insertps $1, %xmm0, %xmm2
# CHECK-NEXT: 1 8 0.50 * insertps $1, (%rax), %xmm2