Previously, we only checked for another commutable operand if the first commute was an aggressive commute.
But if we have two killed operands and neither is tied to the def at the start, we should consider each of them as a candidate for the new def.
This improves the loop in the fma-commute-loop.ll test, which is derived from this Discourse post: https://llvm.discourse.group/t/unnecessary-vmovapd-instructions-generated-can-you-hint-in-favor-of-vfmadd231pd/582
Differential Revision: https://reviews.llvm.org/D75016
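
To make the control-flow change easier to follow, here is a minimal standalone sketch of the scan loop as it behaves after this patch. It is plain C++ with made-up names (Op, profitableToCommute, tryInstructionCommuteSketch), not the LLVM API; commuteInstruction is stubbed with a swap and the profitability heuristic is a stand-in.

  #include <cstddef>
  #include <utility>
  #include <vector>

  struct Op {
    unsigned Reg;
    bool Killed; // the last use of the register is at this instruction
  };

  static unsigned NumCommuted = 0, NumAggrCommuted = 0;

  // Stand-in for the target's profitability heuristic; here we simply
  // prefer a killed operand as the new def.
  static bool profitableToCommute(const Op & /*Base*/, const Op &Other) {
    return Other.Killed;
  }

  static bool tryInstructionCommuteSketch(std::vector<Op> &Ops) {
    bool MadeChange = false;
    const std::size_t BaseIdx = 0; // operand currently tied to the def
    bool BaseKilled = Ops[BaseIdx].Killed;
    for (std::size_t I = 1; I < Ops.size(); ++I) {
      bool OtherKilled = Ops[I].Killed;
      bool Aggressive = false;
      // Kill-based commute: the other operand dies here, the base does not.
      bool DoCommute = !BaseKilled && OtherKilled;
      if (!DoCommute && profitableToCommute(Ops[BaseIdx], Ops[I])) {
        DoCommute = true;
        Aggressive = true; // heuristic-based ("aggressive") commute
      }
      if (!DoCommute)
        continue;
      std::swap(Ops[BaseIdx], Ops[I]); // stand-in for commuteInstruction()
      MadeChange = true;
      ++NumCommuted;
      if (Aggressive)
        ++NumAggrCommuted;
      // Before this patch a kill-based commute returned here; now both
      // kinds of commute fall through and the scan continues with the
      // commuted operand as the new base, as in the hunk below.
      BaseKilled = OtherKilled;
    }
    return MadeChange;
  }
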
Dist)) {
MadeChange = true;
++NumCommuted;
- if (AggressiveCommute) {
+ if (AggressiveCommute)
++NumAggrCommuted;
- // There might be more than two commutable operands, update BaseOp and
- // continue scanning.
- // FIXME: This assumes that the new instruction's operands are in the
- // same positions and were simply swapped.
- BaseOpReg = OtherOpReg;
- BaseOpKilled = OtherOpKilled;
- // Resamples OpsNum in case the number of operands was reduced. This
- // happens with X86.
- OpsNum = MI->getDesc().getNumOperands();
- continue;
- }
- // If this was a commute based on kill, we won't do better continuing.
- return MadeChange;
+
+ // There might be more than two commutable operands, update BaseOp and
+ // continue scanning.
+ // FIXME: This assumes that the new instruction's operands are in the
+ // same positions and were simply swapped.
+ BaseOpReg = OtherOpReg;
+ BaseOpKilled = OtherOpKilled;
+ // Resamples OpsNum in case the number of operands was reduced. This
+ // happens with X86.
+ OpsNum = MI->getDesc().getNumOperands();
}
}
return MadeChange;
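
For illustration, a small driver for the sketch above (same made-up names): a non-killed base operand plus two killed operands, the case described in the commit message. Before this patch the scan would have stopped after the first, kill-based commute.

  #include <cassert>

  int main() {
    std::vector<Op> Ops = {{/*Reg=*/1, /*Killed=*/false},  // tied to the def
                           {/*Reg=*/2, /*Killed=*/true},
                           {/*Reg=*/3, /*Killed=*/true}};
    bool Changed = tryInstructionCommuteSketch(Ops);
    // The kill-based commute makes reg 2 the base, then the heuristic
    // commute also runs, so reg 3 is considered as the new def as well.
    assert(Changed && NumCommuted == 2 && NumAggrCommuted == 1);
    assert(Ops[0].Reg == 3);
    return 0;
  }
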
; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %r12
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vxorpd %xmm5, %xmm5, %xmm5
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB0_1: ## %bb15
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovapd %zmm5, %zmm6
-; CHECK-NEXT: vmovapd %zmm4, %zmm7
-; CHECK-NEXT: vmovupd (%rax,%r11,8), %zmm4
-; CHECK-NEXT: vmovupd (%rax,%r13,8), %zmm5
+; CHECK-NEXT: vmovupd (%rax,%r11,8), %zmm6
+; CHECK-NEXT: vmovupd (%rax,%r13,8), %zmm7
; CHECK-NEXT: vmovupd (%rax,%r12,8), %zmm8
; CHECK-NEXT: vbroadcastsd (%r15,%rbx,8), %zmm9
-; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm4 * zmm9) + zmm0
-; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm5 * zmm9) + zmm1
+; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm6 * zmm9) + zmm0
+; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm7 * zmm9) + zmm1
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm8 * zmm9) + zmm2
; CHECK-NEXT: vbroadcastsd (%r14,%rbx,8), %zmm9
-; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm4 = (zmm9 * zmm4) + zmm7
-; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm5 = (zmm9 * zmm5) + zmm6
-; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm3 = (zmm8 * zmm9) + zmm3
+; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm3 = (zmm9 * zmm6) + zmm3
+; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm4 = (zmm9 * zmm7) + zmm4
+; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm5 = (zmm8 * zmm9) + zmm5
; CHECK-NEXT: incq %rbx
; CHECK-NEXT: cmpq %rbx, %r10
; CHECK-NEXT: jne LBB0_1
; CHECK-NEXT: vmovapd %zmm0, (%rdi)
; CHECK-NEXT: vmovapd %zmm1, (%rsi)
; CHECK-NEXT: vmovapd %zmm2, (%rdx)
-; CHECK-NEXT: vmovapd %zmm4, (%rcx)
-; CHECK-NEXT: vmovapd %zmm5, (%r8)
-; CHECK-NEXT: vmovapd %zmm3, (%r9)
+; CHECK-NEXT: vmovapd %zmm3, (%rcx)
+; CHECK-NEXT: vmovapd %zmm4, (%r8)
+; CHECK-NEXT: vmovapd %zmm5, (%r9)
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r12
; CHECK-NEXT: popq %r13
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2
; FMA-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3
-; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; FMA-RECIP-NEXT: vmovaps %xmm2, %xmm0
+; FMA-RECIP-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: f32_one_step_variables:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm2
; HASWELL-NEXT: vmulss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; HASWELL-NEXT: vmovaps %xmm2, %xmm0
+; HASWELL-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: f32_one_step_variables:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpss %xmm1, %xmm1, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; AVX512-NEXT: vmovaps %xmm2, %xmm0
+; AVX512-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; AVX512-NEXT: retq
%div = fdiv fast float %x, %y
ret float %div
;
; HASWELL-LABEL: v4f32_one_step:
; HASWELL: # %bb.0:
-; HASWELL-NEXT: vrcpps %xmm0, %xmm1
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
+; HASWELL-NEXT: vrcpps %xmm0, %xmm2
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1
+; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2
+; HASWELL-NEXT: vmovaps %xmm1, %xmm0
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step:
;
; KNL-LABEL: v4f32_one_step:
; KNL: # %bb.0:
-; KNL-NEXT: vrcpps %xmm0, %xmm1
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
+; KNL-NEXT: vrcpps %xmm0, %xmm2
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT: vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1
+; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2
+; KNL-NEXT: vmovaps %xmm1, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: v4f32_one_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm1, %xmm2
; FMA-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3
-; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; FMA-RECIP-NEXT: vmovaps %xmm2, %xmm0
+; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v4f32_one_step_variables:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm1, %xmm2
; HASWELL-NEXT: vmulps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; HASWELL-NEXT: vmovaps %xmm2, %xmm0
+; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step_variables:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpps %xmm1, %xmm2
; AVX512-NEXT: vmulps %xmm2, %xmm0, %xmm3
-; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; AVX512-NEXT: vmovaps %xmm2, %xmm0
+; AVX512-NEXT: vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; AVX512-NEXT: retq
%div = fdiv fast <4 x float> %x, %y
ret <4 x float> %div
;
; HASWELL-LABEL: v8f32_one_step:
; HASWELL: # %bb.0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
+; HASWELL-NEXT: vrcpps %ymm0, %ymm2
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1
+; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
+; HASWELL-NEXT: vmovaps %ymm1, %ymm0
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step:
;
; KNL-LABEL: v8f32_one_step:
; KNL: # %bb.0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1
-; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; KNL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
+; KNL-NEXT: vrcpps %ymm0, %ymm2
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT: vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1
+; KNL-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
+; KNL-NEXT: vmovaps %ymm1, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: v8f32_one_step:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1
-; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
+; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm0
+; HASWELL-NEXT: vmulps %xmm2, %xmm0, %xmm0
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1
; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
-; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1
-; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0
+; KNL-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
+; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
+; KNL-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm0
+; KNL-NEXT: vmulps %xmm2, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: v4f32_one_step_2_divs:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1
-; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm1 * ymm0) - ymm2
+; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm2, %ymm0
+; HASWELL-NEXT: vmulps %ymm2, %ymm0, %ymm0
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %ymm0, %ymm1
; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; KNL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
-; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1
-; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0
+; KNL-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm1 * ymm0) - ymm2
+; KNL-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
+; KNL-NEXT: vmulps {{.*}}(%rip), %ymm2, %ymm0
+; KNL-NEXT: vmulps %ymm2, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: v8f32_one_step_2_divs:
; AVX512-NEXT: vrsqrtps %xmm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vmulps %xmm2, %xmm0, %xmm0
; AVX512-NEXT: retq
%sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
; AVX512-NEXT: vrsqrtps %ymm0, %ymm1
; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
-; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; AVX512-NEXT: vmulps %ymm2, %ymm1, %ymm1
+; AVX512-NEXT: vfmadd231ps {{.*#+}} ymm2 = (ymm1 * ymm0) + ymm2
+; AVX512-NEXT: vbroadcastss {{.*#+}} ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT: vmulps %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vmulps %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
%sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt