From 734e2386ffb34e5ab5fbdc1063fd11e6a2a632ce Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 24 Nov 2021 09:51:33 +0000 Subject: [PATCH] [ARM] Add fma and update fadd/fmul predicated select tests. NFC --- llvm/test/CodeGen/Thumb2/mve-pred-selectop2.ll | 60 ++++++ llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll | 250 +++++++++++++++++-------- 2 files changed, 234 insertions(+), 76 deletions(-) diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-selectop2.ll b/llvm/test/CodeGen/Thumb2/mve-pred-selectop2.ll index dbff132..d8376ad 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-selectop2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-selectop2.ll @@ -427,6 +427,64 @@ entry: ret <8 x half> %b } +define arm_aapcs_vfpcc <4 x float> @fmai_v4f32_x(<4 x float> %x, <4 x float> %y, <4 x float> %z, i32 %n) { +; CHECK-LABEL: fmai_v4f32_x: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.32 r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vfmat.f32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) + %a = call <4 x float> @llvm.fma.v4f32(<4 x float> %y, <4 x float> %z, <4 x float> %x) + %b = select <4 x i1> %c, <4 x float> %a, <4 x float> %x + ret <4 x float> %b +} + +define arm_aapcs_vfpcc <8 x half> @fmai_v8f16_x(<8 x half> %x, <8 x half> %y, <8 x half> %z, i32 %n) { +; CHECK-LABEL: fmai_v8f16_x: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.16 r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vfmat.f16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) + %a = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %x) + %b = select <8 x i1> %c, <8 x half> %a, <8 x half> %x + ret <8 x half> %b +} + +define arm_aapcs_vfpcc <4 x float> @fma_v4f32_x(<4 x float> %x, <4 x float> %y, <4 x float> %z, i32 %n) { +; CHECK-LABEL: fma_v4f32_x: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.32 r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vfmat.f32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) + %m = fmul fast <4 x float> %y, %z + %a = fadd fast <4 x float> %m, %x + %b = select <4 x i1> %c, <4 x float> %a, <4 x float> %x + ret <4 x float> %b +} + +define arm_aapcs_vfpcc <8 x half> @fma_v8f16_x(<8 x half> %x, <8 x half> %y, <8 x half> %z, i32 %n) { +; CHECK-LABEL: fma_v8f16_x: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.16 r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vfmat.f16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) + %m = fmul fast <8 x half> %y, %z + %a = fadd fast <8 x half> %m, %x + %b = select <8 x i1> %c, <8 x half> %a, <8 x half> %x + ret <8 x half> %b +} + define arm_aapcs_vfpcc <4 x i32> @icmp_slt_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) { ; CHECK-LABEL: icmp_slt_v4i32_x: ; CHECK: @ %bb.0: @ %entry @@ -2669,6 +2727,8 @@ declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) +declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>) declare <16 x i1> @llvm.arm.mve.vctp8(i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll b/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll index 20d32d2..2ad93c4b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll @@ -352,84 +352,164 @@ 
entry: define arm_aapcs_vfpcc <4 x float> @fadd_v4f32_x(<4 x float> %x, <4 x float> %y, i32 %n) { ; CHECK-LABEL: fadd_v4f32_x: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q2, #0x80000000 ; CHECK-NEXT: vctp.32 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vaddt.f32 q0, q0, q1 +; CHECK-NEXT: vmovt q2, q1 +; CHECK-NEXT: vadd.f32 q0, q2, q0 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) - %a = fadd <4 x float> %x, %y - %b = select <4 x i1> %c, <4 x float> %a, <4 x float> %x + %a = select <4 x i1> %c, <4 x float> %y, <4 x float> + %b = fadd <4 x float> %a, %x ret <4 x float> %b } define arm_aapcs_vfpcc <8 x half> @fadd_v8f16_x(<8 x half> %x, <8 x half> %y, i32 %n) { ; CHECK-LABEL: fadd_v8f16_x: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i16 q2, #0x8000 ; CHECK-NEXT: vctp.16 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vaddt.f16 q0, q0, q1 +; CHECK-NEXT: vmovt q2, q1 +; CHECK-NEXT: vadd.f16 q0, q2, q0 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) - %a = fadd <8 x half> %x, %y - %b = select <8 x i1> %c, <8 x half> %a, <8 x half> %x + %a = select <8 x i1> %c, <8 x half> %y, <8 x half> + %b = fadd <8 x half> %a, %x ret <8 x half> %b } define arm_aapcs_vfpcc <4 x float> @fsub_v4f32_x(<4 x float> %x, <4 x float> %y, i32 %n) { ; CHECK-LABEL: fsub_v4f32_x: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.32 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vsubt.f32 q0, q0, q1 +; CHECK-NEXT: vmovt q2, q1 +; CHECK-NEXT: vsub.f32 q0, q0, q2 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) - %a = fsub <4 x float> %x, %y - %b = select <4 x i1> %c, <4 x float> %a, <4 x float> %x + %a = select <4 x i1> %c, <4 x float> %y, <4 x float> zeroinitializer + %b = fsub <4 x float> %x, %a ret <4 x float> %b } define arm_aapcs_vfpcc <8 x half> @fsub_v8f16_x(<8 x half> %x, <8 x half> %y, i32 %n) { ; CHECK-LABEL: fsub_v8f16_x: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vctp.16 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vsubt.f16 q0, q0, q1 +; CHECK-NEXT: vmovt q2, q1 +; CHECK-NEXT: vsub.f16 q0, q0, q2 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) - %a = fsub <8 x half> %x, %y - %b = select <8 x i1> %c, <8 x half> %a, <8 x half> %x + %a = select <8 x i1> %c, <8 x half> %y, <8 x half> zeroinitializer + %b = fsub <8 x half> %x, %a ret <8 x half> %b } define arm_aapcs_vfpcc <4 x float> @fmul_v4f32_x(<4 x float> %x, <4 x float> %y, i32 %n) { ; CHECK-LABEL: fmul_v4f32_x: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 q2, #1.000000e+00 ; CHECK-NEXT: vctp.32 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.f32 q0, q0, q1 +; CHECK-NEXT: vmovt q2, q1 +; CHECK-NEXT: vmul.f32 q0, q2, q0 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) - %a = fmul <4 x float> %x, %y - %b = select <4 x i1> %c, <4 x float> %a, <4 x float> %x + %a = select <4 x i1> %c, <4 x float> %y, <4 x float> + %b = fmul <4 x float> %a, %x ret <4 x float> %b } define arm_aapcs_vfpcc <8 x half> @fmul_v8f16_x(<8 x half> %x, <8 x half> %y, i32 %n) { ; CHECK-LABEL: fmul_v8f16_x: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i16 q2, #0x3c00 ; CHECK-NEXT: vctp.16 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.f16 q0, q0, q1 +; CHECK-NEXT: vmovt q2, q1 +; CHECK-NEXT: vmul.f16 q0, q2, q0 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) - %a = fmul <8 x half> %x, %y - %b = select <8 x i1> %c, <8 x half> %a, <8 x half> %x + %a = select <8 x i1> %c, <8 x half> %y, <8 x half> + %b = 
fmul <8 x half> %a, %x + ret <8 x half> %b +} + +define arm_aapcs_vfpcc <4 x float> @fmai_v4f32_x(<4 x float> %x, <4 x float> %y, <4 x float> %z, i32 %n) { +; CHECK-LABEL: fmai_v4f32_x: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q3, #0x80000000 +; CHECK-NEXT: vctp.32 r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmovt q3, q0 +; CHECK-NEXT: vfma.f32 q3, q1, q2 +; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: bx lr +entry: + %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) + %a = select <4 x i1> %c, <4 x float> %x, <4 x float> + %b = call <4 x float> @llvm.fma.v4f32(<4 x float> %y, <4 x float> %z, <4 x float> %a) + ret <4 x float> %b +} + +define arm_aapcs_vfpcc <8 x half> @fmai_v8f16_x(<8 x half> %x, <8 x half> %y, <8 x half> %z, i32 %n) { +; CHECK-LABEL: fmai_v8f16_x: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i16 q3, #0x8000 +; CHECK-NEXT: vctp.16 r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmovt q3, q0 +; CHECK-NEXT: vfma.f16 q3, q1, q2 +; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: bx lr +entry: + %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) + %a = select <8 x i1> %c, <8 x half> %x, <8 x half> + %b = call <8 x half> @llvm.fma.v8f16(<8 x half> %y, <8 x half> %z, <8 x half> %a) + ret <8 x half> %b +} + +define arm_aapcs_vfpcc <4 x float> @fma_v4f32_x(<4 x float> %x, <4 x float> %y, <4 x float> %z, i32 %n) { +; CHECK-LABEL: fma_v4f32_x: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q3, #0x80000000 +; CHECK-NEXT: vctp.32 r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.f32 q3, q1, q2 +; CHECK-NEXT: vadd.f32 q0, q3, q0 +; CHECK-NEXT: bx lr +entry: + %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) + %m = fmul fast <4 x float> %y, %z + %a = select <4 x i1> %c, <4 x float> %m, <4 x float> + %b = fadd fast <4 x float> %a, %x + ret <4 x float> %b +} + +define arm_aapcs_vfpcc <8 x half> @fma_v8f16_x(<8 x half> %x, <8 x half> %y, <8 x half> %z, i32 %n) { +; CHECK-LABEL: fma_v8f16_x: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i16 q3, #0x8000 +; CHECK-NEXT: vctp.16 r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.f16 q3, q1, q2 +; CHECK-NEXT: vadd.f16 q0, q3, q0 +; CHECK-NEXT: bx lr +entry: + %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) + %m = fmul fast <8 x half> %y, %z + %a = select <8 x i1> %c, <8 x half> %m, <8 x half> + %b = fadd fast <8 x half> %a, %x ret <8 x half> %b } @@ -988,17 +1068,19 @@ entry: define arm_aapcs_vfpcc <4 x float> @faddqr_v4f32_x(<4 x float> %x, float %y, i32 %n) { ; CHECK-LABEL: faddqr_v4f32_x: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q2, #0x80000000 ; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vctp.32 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vaddt.f32 q0, q0, r1 +; CHECK-NEXT: vdupt.32 q2, r1 +; CHECK-NEXT: vadd.f32 q0, q2, q0 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) %i = insertelement <4 x float> undef, float %y, i32 0 %ys = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer - %a = fadd <4 x float> %ys, %x - %b = select <4 x i1> %c, <4 x float> %a, <4 x float> %x + %a = select <4 x i1> %c, <4 x float> %ys, <4 x float> + %b = fadd <4 x float> %a, %x ret <4 x float> %b } @@ -1006,33 +1088,37 @@ define arm_aapcs_vfpcc <8 x half> @faddqr_v8f16_x(<8 x half> %x, half %y, i32 %n ; CHECK-LABEL: faddqr_v8f16_x: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f16 r1, s4 +; CHECK-NEXT: vmov.i16 q1, #0x8000 ; CHECK-NEXT: vctp.16 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vaddt.f16 q0, q0, r1 +; CHECK-NEXT: vdupt.16 q1, r1 +; CHECK-NEXT: vadd.f16 q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) %i = 
insertelement <8 x half> undef, half %y, i32 0 %ys = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer - %a = fadd <8 x half> %ys, %x - %b = select <8 x i1> %c, <8 x half> %a, <8 x half> %x + %a = select <8 x i1> %c, <8 x half> %ys, <8 x half> + %b = fadd <8 x half> %a, %x ret <8 x half> %b } define arm_aapcs_vfpcc <4 x float> @fsubqr_v4f32_x(<4 x float> %x, float %y, i32 %n) { ; CHECK-LABEL: fsubqr_v4f32_x: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vctp.32 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vsubt.f32 q0, q0, r1 +; CHECK-NEXT: vdupt.32 q2, r1 +; CHECK-NEXT: vsub.f32 q0, q0, q2 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) %i = insertelement <4 x float> undef, float %y, i32 0 %ys = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer - %a = fsub <4 x float> %x, %ys - %b = select <4 x i1> %c, <4 x float> %a, <4 x float> %x + %a = select <4 x i1> %c, <4 x float> %ys, <4 x float> zeroinitializer + %b = fsub <4 x float> %x, %a ret <4 x float> %b } @@ -1040,33 +1126,37 @@ define arm_aapcs_vfpcc <8 x half> @fsubqr_v8f16_x(<8 x half> %x, half %y, i32 %n ; CHECK-LABEL: fsubqr_v8f16_x: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f16 r1, s4 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vctp.16 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vsubt.f16 q0, q0, r1 +; CHECK-NEXT: vdupt.16 q1, r1 +; CHECK-NEXT: vsub.f16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) %i = insertelement <8 x half> undef, half %y, i32 0 %ys = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer - %a = fsub <8 x half> %x, %ys - %b = select <8 x i1> %c, <8 x half> %a, <8 x half> %x + %a = select <8 x i1> %c, <8 x half> %ys, <8 x half> zeroinitializer + %b = fsub <8 x half> %x, %a ret <8 x half> %b } define arm_aapcs_vfpcc <4 x float> @fmulqr_v4f32_x(<4 x float> %x, float %y, i32 %n) { ; CHECK-LABEL: fmulqr_v4f32_x: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 q2, #1.000000e+00 ; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vctp.32 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.f32 q0, q0, r1 +; CHECK-NEXT: vdupt.32 q2, r1 +; CHECK-NEXT: vmul.f32 q0, q2, q0 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) %i = insertelement <4 x float> undef, float %y, i32 0 %ys = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer - %a = fmul <4 x float> %ys, %x - %b = select <4 x i1> %c, <4 x float> %a, <4 x float> %x + %a = select <4 x i1> %c, <4 x float> %ys, <4 x float> + %b = fmul <4 x float> %a, %x ret <4 x float> %b } @@ -1074,16 +1164,18 @@ define arm_aapcs_vfpcc <8 x half> @fmulqr_v8f16_x(<8 x half> %x, half %y, i32 %n ; CHECK-LABEL: fmulqr_v8f16_x: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f16 r1, s4 +; CHECK-NEXT: vmov.i16 q1, #0x3c00 ; CHECK-NEXT: vctp.16 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.f16 q0, q0, r1 +; CHECK-NEXT: vdupt.16 q1, r1 +; CHECK-NEXT: vmul.f16 q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) %i = insertelement <8 x half> undef, half %y, i32 0 %ys = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer - %a = fmul <8 x half> %ys, %x - %b = select <8 x i1> %c, <8 x half> %a, <8 x half> %x + %a = select <8 x i1> %c, <8 x half> %ys, <8 x half> + %b = fmul <8 x half> %a, %x ret <8 x half> %b } @@ -1648,30 +1740,32 @@ entry: define arm_aapcs_vfpcc <4 x float> @fadd_v4f32_y(<4 x float> %x, <4 x float> %y, i32 %n) { ; CHECK-LABEL: 
fadd_v4f32_y: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q2, #0x80000000 ; CHECK-NEXT: vctp.32 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vaddt.f32 q1, q0, q1 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmovt q2, q0 +; CHECK-NEXT: vadd.f32 q0, q2, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) - %a = fadd <4 x float> %x, %y - %b = select <4 x i1> %c, <4 x float> %a, <4 x float> %y + %a = select <4 x i1> %c, <4 x float> %x, <4 x float> + %b = fadd <4 x float> %a, %y ret <4 x float> %b } define arm_aapcs_vfpcc <8 x half> @fadd_v8f16_y(<8 x half> %x, <8 x half> %y, i32 %n) { ; CHECK-LABEL: fadd_v8f16_y: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i16 q2, #0x8000 ; CHECK-NEXT: vctp.16 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vaddt.f16 q1, q0, q1 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmovt q2, q0 +; CHECK-NEXT: vadd.f16 q0, q2, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) - %a = fadd <8 x half> %x, %y - %b = select <8 x i1> %c, <8 x half> %a, <8 x half> %y + %a = select <8 x i1> %c, <8 x half> %x, <8 x half> + %b = fadd <8 x half> %a, %y ret <8 x half> %b } @@ -1708,30 +1802,32 @@ entry: define arm_aapcs_vfpcc <4 x float> @fmul_v4f32_y(<4 x float> %x, <4 x float> %y, i32 %n) { ; CHECK-LABEL: fmul_v4f32_y: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 q2, #1.000000e+00 ; CHECK-NEXT: vctp.32 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.f32 q1, q0, q1 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmovt q2, q0 +; CHECK-NEXT: vmul.f32 q0, q2, q1 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) - %a = fmul <4 x float> %x, %y - %b = select <4 x i1> %c, <4 x float> %a, <4 x float> %y + %a = select <4 x i1> %c, <4 x float> %x, <4 x float> + %b = fmul <4 x float> %a, %y ret <4 x float> %b } define arm_aapcs_vfpcc <8 x half> @fmul_v8f16_y(<8 x half> %x, <8 x half> %y, i32 %n) { ; CHECK-LABEL: fmul_v8f16_y: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i16 q2, #0x3c00 ; CHECK-NEXT: vctp.16 r0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.f16 q1, q0, q1 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmovt q2, q0 +; CHECK-NEXT: vmul.f16 q0, q2, q1 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) - %a = fmul <8 x half> %x, %y - %b = select <8 x i1> %c, <8 x half> %a, <8 x half> %y + %a = select <8 x i1> %c, <8 x half> %x, <8 x half> + %b = fmul <8 x half> %a, %y ret <8 x half> %b } @@ -2352,38 +2448,38 @@ entry: define arm_aapcs_vfpcc <4 x float> @faddqr_v4f32_y(<4 x float> %x, float %y, i32 %n) { ; CHECK-LABEL: faddqr_v4f32_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vctp.32 r0 -; CHECK-NEXT: vdup.32 q1, r1 +; CHECK-NEXT: vmov.i32 q2, #0x80000000 ; CHECK-NEXT: vpst -; CHECK-NEXT: vaddt.f32 q1, q0, r1 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmovt q2, q0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vadd.f32 q0, q2, r0 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) %i = insertelement <4 x float> undef, float %y, i32 0 %ys = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer - %a = fadd <4 x float> %ys, %x - %b = select <4 x i1> %c, <4 x float> %a, <4 x float> %ys + %a = select <4 x i1> %c, <4 x float> %x, <4 x float> + %b = fadd <4 x float> %ys, %a ret <4 x float> %b } define arm_aapcs_vfpcc <8 x half> @faddqr_v8f16_y(<8 x half> %x, half %y, i32 %n) { ; CHECK-LABEL: faddqr_v8f16_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 r1, s4 ; CHECK-NEXT: vctp.16 r0 -; CHECK-NEXT: vdup.16 q1, r1 +; CHECK-NEXT: vmov.i16 q2, #0x8000 
; CHECK-NEXT: vpst -; CHECK-NEXT: vaddt.f16 q1, q0, r1 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmovt q2, q0 +; CHECK-NEXT: vmov.f16 r0, s4 +; CHECK-NEXT: vadd.f16 q0, q2, r0 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) %i = insertelement <8 x half> undef, half %y, i32 0 %ys = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer - %a = fadd <8 x half> %ys, %x - %b = select <8 x i1> %c, <8 x half> %a, <8 x half> %ys + %a = select <8 x i1> %c, <8 x half> %x, <8 x half> + %b = fadd <8 x half> %ys, %a ret <8 x half> %b } @@ -2428,38 +2524,38 @@ entry: define arm_aapcs_vfpcc <4 x float> @fmulqr_v4f32_y(<4 x float> %x, float %y, i32 %n) { ; CHECK-LABEL: fmulqr_v4f32_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vctp.32 r0 -; CHECK-NEXT: vdup.32 q1, r1 +; CHECK-NEXT: vmov.f32 q2, #1.000000e+00 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.f32 q1, q0, r1 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmovt q2, q0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmul.f32 q0, q2, r0 ; CHECK-NEXT: bx lr entry: %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n) %i = insertelement <4 x float> undef, float %y, i32 0 %ys = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer - %a = fmul <4 x float> %ys, %x - %b = select <4 x i1> %c, <4 x float> %a, <4 x float> %ys + %a = select <4 x i1> %c, <4 x float> %x, <4 x float> + %b = fmul <4 x float> %ys, %a ret <4 x float> %b } define arm_aapcs_vfpcc <8 x half> @fmulqr_v8f16_y(<8 x half> %x, half %y, i32 %n) { ; CHECK-LABEL: fmulqr_v8f16_y: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 r1, s4 ; CHECK-NEXT: vctp.16 r0 -; CHECK-NEXT: vdup.16 q1, r1 +; CHECK-NEXT: vmov.i16 q2, #0x3c00 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.f16 q1, q0, r1 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmovt q2, q0 +; CHECK-NEXT: vmov.f16 r0, s4 +; CHECK-NEXT: vmul.f16 q0, q2, r0 ; CHECK-NEXT: bx lr entry: %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) %i = insertelement <8 x half> undef, half %y, i32 0 %ys = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer - %a = fmul <8 x half> %ys, %x - %b = select <8 x i1> %c, <8 x half> %a, <8 x half> %ys + %a = select <8 x i1> %c, <8 x half> %x, <8 x half> + %b = fmul <8 x half> %ys, %a ret <8 x half> %b } @@ -2691,6 +2787,8 @@ declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) +declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>) declare <16 x i1> @llvm.arm.mve.vctp8(i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) -- 2.7.4
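
For reference, the updated mve-pred-selectop3.ll tests express the predicated operation as an unpredicated fadd/fmul/fsub whose inactive lanes are fed the operation's identity value through a select: -0.0 for fadd (the #0x80000000 / #0x8000 splats materialised in the CHECK lines), 1.0 for fmul (#1.000000e+00 / #0x3c00), and 0.0 for fsub (zeroinitializer). Below is a minimal standalone sketch of that pattern, assuming the identity constants implied by those vmov immediates; the function names are illustrative only and the bodies mirror the fadd_v4f32_x / fmul_v4f32_x tests above rather than reproducing them verbatim.

; Sketch only: illustrative function names, not tests from the patch.
declare <4 x i1> @llvm.arm.mve.vctp32(i32)

define arm_aapcs_vfpcc <4 x float> @fadd_identity_sketch(<4 x float> %x, <4 x float> %y, i32 %n) {
entry:
  %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
  ; Inactive lanes take -0.0, the fadd identity, so the add leaves %x unchanged there.
  %a = select <4 x i1> %c, <4 x float> %y, <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>
  %b = fadd <4 x float> %a, %x
  ret <4 x float> %b
}

define arm_aapcs_vfpcc <4 x float> @fmul_identity_sketch(<4 x float> %x, <4 x float> %y, i32 %n) {
entry:
  %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
  ; Inactive lanes take 1.0, the fmul identity.
  %a = select <4 x i1> %c, <4 x float> %y, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
  %b = fmul <4 x float> %a, %x
  ret <4 x float> %b
}

As the CHECK lines in the diff show, this identity-value form is not yet folded back into a single predicated vaddt/vmult in selectop3.ll: the recorded codegen is a vmov of the identity splat, a predicated vmovt of the live operand, then an unpredicated vadd/vmul. The select-of-result form tested in selectop2.ll (including the new fma/fmai cases) does fold into the predicated vfmat under a VPT block.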