From a947be51bdaf3cb87e58a93126fa0f577865e7f1 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 23 Apr 2020 21:58:00 +0100
Subject: [PATCH] [ARM] Various tests for MVE and FP16 codegen. NFC

---
 llvm/test/CodeGen/ARM/fp16-bitcast.ll  |  38 ++++++++++
 llvm/test/CodeGen/Thumb2/mve-vaddqr.ll | 122 ++++++++++++++++++++++++++++++++-
 llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll | 135 +++++++++++++++++++++++++++++++++
 llvm/test/CodeGen/Thumb2/mve-vdup.ll   |  97 +++++++++++++++++++++++
 4 files changed, 391 insertions(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/ARM/fp16-bitcast.ll b/llvm/test/CodeGen/ARM/fp16-bitcast.ll
index 6d6b809..e1fdf88 100644
--- a/llvm/test/CodeGen/ARM/fp16-bitcast.ll
+++ b/llvm/test/CodeGen/ARM/fp16-bitcast.ll
@@ -47,3 +47,41 @@ entry:
   %tmp4.0.insert.ext = zext i16 %2 to i32
   ret i32 %tmp4.0.insert.ext
 }
+
+define half @load_i16(i16 *%hp) {
+; CHECK-VFPV4-LABEL: load_i16:
+; CHECK-VFPV4: @ %bb.0: @ %entry
+; CHECK-VFPV4-NEXT: vmov.f32 s0, #1.000000e+00
+; CHECK-VFPV4-NEXT: ldrh r0, [r0]
+; CHECK-VFPV4-NEXT: vmov s2, r0
+; CHECK-VFPV4-NEXT: vcvtb.f32.f16 s2, s2
+; CHECK-VFPV4-NEXT: vadd.f32 s0, s2, s0
+; CHECK-VFPV4-NEXT: vmov r0, s0
+; CHECK-VFPV4-NEXT: bx lr
+;
+; CHECK-FP16-LABEL: load_i16:
+; CHECK-FP16: @ %bb.0: @ %entry
+; CHECK-FP16-NEXT: vldr.16 s2, [r1]
+; CHECK-FP16-NEXT: vmov.f16 s0, #1.000000e+00
+; CHECK-FP16-NEXT: vadd.f16 s0, s2, s0
+; CHECK-FP16-NEXT: vstr.16 s0, [r0]
+; CHECK-FP16-NEXT: bx lr
+entry:
+  %h = load i16, i16 *%hp, align 2
+  %hc = bitcast i16 %h to half
+  %add = fadd half %hc, 1.0
+  ret half %add
+}
+
+define i16 @load_f16(half *%hp) {
+; CHECK-LABEL: load_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r0, [r0]
+; CHECK-NEXT: adds r0, #1
+; CHECK-NEXT: bx lr
+entry:
+  %h = load half, half *%hp, align 2
+  %hc = bitcast half %h to i16
+  %add = add i16 %hc, 1
+  ret i16 %add
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vaddqr.ll b/llvm/test/CodeGen/Thumb2/mve-vaddqr.ll
index da07cd6..4c69761 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vaddqr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vaddqr.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -verify-machineinstrs -mattr=+mve %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -verify-machineinstrs -mattr=+mve.fp %s -o - | FileCheck %s
 
 define arm_aapcs_vfpcc <4 x i32> @vaddqr_v4i32(<4 x i32> %src, i32 %src2, <4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: vaddqr_v4i32:
@@ -72,3 +72,123 @@ entry:
   %c = add <16 x i8> %sp, %src
   ret <16 x i8> %c
 }
+
+define arm_aapcs_vfpcc <4 x float> @vaddqr_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: vaddqr_v4f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vadd.f32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+  %i = insertelement <4 x float> undef, float %src2, i32 0
+  %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+  %c = fadd <4 x float> %src, %sp
+  ret <4 x float> %c
+}
+
+define arm_aapcs_vfpcc <8 x half> @vaddqr_v8f16(<8 x half> %src, half *%src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: vaddqr_v8f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r0, [r0]
+; CHECK-NEXT: vadd.f16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+  %src2 = load half, half *%src2p, align 2
+  %i = insertelement <8 x half> undef, half %src2, i32 0
+  %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
+  %c = fadd <8 x half> %src, %sp
+  ret <8 x half> %c
+}
+
+define arm_aapcs_vfpcc <4 x float> @vaddqr_v4f32_2(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: vaddqr_v4f32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vadd.f32 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+  %i = insertelement <4 x float> undef, float %src2, i32 0
+  %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+  %c = fadd <4 x float> %sp, %src
+  ret <4 x float> %c
+}
+
+define arm_aapcs_vfpcc <8 x half> @vaddqr_v8f16_2(<8 x half> %src, half *%src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: vaddqr_v8f16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r0, [r0]
+; CHECK-NEXT: vadd.f16 q0, q0, r0
+; CHECK-NEXT: bx lr
+entry:
+  %src2 = load half, half *%src2p, align 2
+  %i = insertelement <8 x half> undef, half %src2, i32 0
+  %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
+  %c = fadd <8 x half> %sp, %src
+  ret <8 x half> %c
+}
+
+define arm_aapcs_vfpcc <4 x float> @vaddqr_v4f32_3(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: vaddqr_v4f32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vdup.32 q1, r0
+; CHECK-NEXT: vadd.f32 q0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+  %src2bc = bitcast float %src2 to i32
+  %i = insertelement <4 x i32> undef, i32 %src2bc, i32 0
+  %spbc = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %sp = bitcast <4 x i32> %spbc to <4 x float>
+  %c = fadd <4 x float> %src, %sp
+  ret <4 x float> %c
+}
+
+define arm_aapcs_vfpcc <8 x half> @vaddqr_v8f16_3(<8 x half> %src, half *%src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: vaddqr_v8f16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r0, [r0]
+; CHECK-NEXT: vdup.16 q1, r0
+; CHECK-NEXT: vadd.f16 q0, q0, q1
+; CHECK-NEXT: bx lr
+entry:
+  %src2 = load half, half *%src2p, align 2
+  %src2bc = bitcast half %src2 to i16
+  %i = insertelement <8 x i16> undef, i16 %src2bc, i32 0
+  %spbc = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer
+  %sp = bitcast <8 x i16> %spbc to <8 x half>
+  %c = fadd <8 x half> %src, %sp
+  ret <8 x half> %c
+}
+
+define arm_aapcs_vfpcc <4 x float> @vaddqr_v4f32_4(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: vaddqr_v4f32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vdup.32 q1, r0
+; CHECK-NEXT: vadd.f32 q0, q1, q0
+; CHECK-NEXT: bx lr
+entry:
+  %src2bc = bitcast float %src2 to i32
+  %i = insertelement <4 x i32> undef, i32 %src2bc, i32 0
+  %spbc = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %sp = bitcast <4 x i32> %spbc to <4 x float>
+  %c = fadd <4 x float> %sp, %src
+  ret <4 x float> %c
+}
+
+define arm_aapcs_vfpcc <8 x half> @vaddqr_v8f16_4(<8 x half> %src, half *%src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: vaddqr_v8f16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r0, [r0]
+; CHECK-NEXT: vdup.16 q1, r0
+; CHECK-NEXT: vadd.f16 q0, q1, q0
+; CHECK-NEXT: bx lr
+entry:
+  %src2 = load half, half *%src2p, align 2
+  %src2bc = bitcast half %src2 to i16
+  %i = insertelement <8 x i16> undef, i16 %src2bc, i32 0
+  %spbc = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer
+  %sp = bitcast <8 x i16> %spbc to <8 x half>
+  %c = fadd <8 x half> %sp, %src
+  ret <8 x half> %c
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
index 1c6c0ff..f03034c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
@@ -5192,3 +5192,138 @@ entry:
   %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b
   ret <8 x half> %s
 }
+
+
+
+define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16_bc(<8 x half> %src, half* %src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-MVE-LABEL: vcmp_oeq_v8f16_bc:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT: ldrh r0, [r0]
+; CHECK-MVE-NEXT: vmovx.f16 s12, s0
+; CHECK-MVE-NEXT: movs r2, #0
+; CHECK-MVE-NEXT: movs r1, #0
+; CHECK-MVE-NEXT: vdup.16 q4, r0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmovx.f16 s14, s16
+; CHECK-MVE-NEXT: vmovx.f16 s22, s17
+; CHECK-MVE-NEXT: vcmp.f16 s12, s14
+; CHECK-MVE-NEXT: vmovx.f16 s12, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: vcmp.f16 s0, s16
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s14, s8
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s0, s3
+; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmov r0, s12
+; CHECK-MVE-NEXT: lsls r2, r2, #31
+; CHECK-MVE-NEXT: vcmp.f16 s1, s17
+; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r2, s12
+; CHECK-MVE-NEXT: vmov.16 q3[0], r2
+; CHECK-MVE-NEXT: vmov.16 q3[1], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s20, s9, s5
+; CHECK-MVE-NEXT: vmov r0, s20
+; CHECK-MVE-NEXT: vmovx.f16 s20, s1
+; CHECK-MVE-NEXT: vcmp.f16 s20, s22
+; CHECK-MVE-NEXT: vmov.16 q3[2], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s20, s5
+; CHECK-MVE-NEXT: vmovx.f16 s22, s9
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT: vcmp.f16 s2, s18
+; CHECK-MVE-NEXT: vmov r0, s20
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov.16 q3[3], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s22, s18
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s20, s10, s6
+; CHECK-MVE-NEXT: vmov r0, s20
+; CHECK-MVE-NEXT: vmovx.f16 s20, s2
+; CHECK-MVE-NEXT: vcmp.f16 s20, s22
+; CHECK-MVE-NEXT: vmov.16 q3[4], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s20, s6
+; CHECK-MVE-NEXT: vmovx.f16 s22, s10
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20
+; CHECK-MVE-NEXT: vcmp.f16 s3, s19
+; CHECK-MVE-NEXT: vmov r0, s20
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov.16 q3[5], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s2, s19
+; CHECK-MVE-NEXT: vcmp.f16 s0, s2
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s20, s11, s7
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r1, #1
+; CHECK-MVE-NEXT: vmov r0, s20
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[6], r0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s0, s7
+; CHECK-MVE-NEXT: vmovx.f16 s2, s11
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0
+; CHECK-MVE-NEXT: vmov r0, s0
+; CHECK-MVE-NEXT: vmov.16 q3[7], r0
+; CHECK-MVE-NEXT: vmov q0, q3
+; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16_bc:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: ldrh r0, [r0]
+; CHECK-MVEFP-NEXT: vdup.16 q3, r0
+; CHECK-MVEFP-NEXT: vcmp.f16 eq, q0, q3
+; CHECK-MVEFP-NEXT: vpsel q0, q1, q2
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+  %src2 = load half, half* %src2p
+  %src2bc = bitcast half %src2 to i16
+  %i = insertelement <8 x i16> undef, i16 %src2bc, i32 0
+  %spbc = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer
+  %sp = bitcast <8 x i16> %spbc to <8 x half>
+  %c = fcmp oeq <8 x half> %src, %sp
+  %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b
+  ret <8 x half> %s
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
index f855b12..ae91b52 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
@@ -75,6 +75,36 @@ entry:
   ret <4 x float> %out
 }
 
+define arm_aapcs_vfpcc <4 x float> @vdup_f32_1bc(float %src) {
+; CHECK-LABEL: vdup_f32_1bc:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: vdup.32 q0, r0
+; CHECK-NEXT: bx lr
+entry:
+  %srcbc = bitcast float %src to i32
+  %0 = insertelement <4 x i32> undef, i32 %srcbc, i32 0
+  %out = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
+  %outbc = bitcast <4 x i32> %out to <4 x float>
+  ret <4 x float> %outbc
+}
+
+define arm_aapcs_vfpcc <4 x float> @vdup_f32_2bc(float %src1, float %src2) {
+; CHECK-LABEL: vdup_f32_2bc:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vadd.f32 s0, s0, s1
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: vdup.32 q0, r0
+; CHECK-NEXT: bx lr
+entry:
+  %0 = fadd float %src1, %src2
+  %bc = bitcast float %0 to i32
+  %1 = insertelement <4 x i32> undef, i32 %bc, i32 0
+  %out = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+  %outbc = bitcast <4 x i32> %out to <4 x float>
+  ret <4 x float> %outbc
+}
+
 ; TODO: Calling convention needs fixing to pass half types directly to functions
 define arm_aapcs_vfpcc <8 x half> @vdup_f16(half* %src1, half* %src2) {
 ; CHECK-LABEL: vdup_f16:
@@ -94,6 +94,30 @@ entry:
   ret <8 x half> %out
 }
 
+define arm_aapcs_vfpcc <8 x half> @vdup_f16_bc(half* %src1, half* %src2) {
+; CHECK-LABEL: vdup_f16_bc:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: vldr.16 s0, [r1]
+; CHECK-NEXT: vldr.16 s2, [r0]
+; CHECK-NEXT: vadd.f16 s0, s2, s0
+; CHECK-NEXT: vstr.16 s0, [sp, #2]
+; CHECK-NEXT: ldrh.w r0, [sp, #2]
+; CHECK-NEXT: vdup.16 q0, r0
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: bx lr
+entry:
+  %0 = load half, half *%src1, align 2
+  %1 = load half, half *%src2, align 2
+  %2 = fadd half %0, %1
+  %bc = bitcast half %2 to i16
+  %3 = insertelement <8 x i16> undef, i16 %bc, i32 0
+  %out = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
+  %outbc = bitcast <8 x i16> %out to <8 x half>
+  ret <8 x half> %outbc
+}
+
 define arm_aapcs_vfpcc <2 x double> @vdup_f64(double %src) {
 ; CHECK-LABEL: vdup_f64:
 ; CHECK: @ %bb.0: @ %entry
@@ -185,3 +185,46 @@ entry:
   %out = shufflevector <2 x double> %src, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   ret <2 x double> %out
 }
+
+
+define arm_aapcs_vfpcc float @vdup_f32_extract(float %src) {
+; CHECK-LABEL: vdup_f32_extract:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: vdup.32 q0, r0
+; CHECK-NEXT: vmov.f32 s0, s2
+; CHECK-NEXT: bx lr
+entry:
+  %srcbc = bitcast float %src to i32
+  %0 = insertelement <4 x i32> undef, i32 %srcbc, i32 0
+  %out = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
+  %outbc = bitcast <4 x i32> %out to <4 x float>
+  %ext = extractelement <4 x float> %outbc, i32 2
+  ret float %ext
+}
+
+define arm_aapcs_vfpcc half @vdup_f16_extract(half* %src1, half* %src2) {
+; CHECK-LABEL: vdup_f16_extract:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: vldr.16 s0, [r2]
+; CHECK-NEXT: vldr.16 s2, [r1]
+; CHECK-NEXT: vadd.f16 s0, s2, s0
+; CHECK-NEXT: vstr.16 s0, [sp, #2]
+; CHECK-NEXT: ldrh.w r1, [sp, #2]
+; CHECK-NEXT: vdup.16 q0, r1
+; CHECK-NEXT: vstr.16 s1, [r0]
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: bx lr
+entry:
+  %0 = load half, half *%src1, align 2
+  %1 = load half, half *%src2, align 2
+  %2 = fadd half %0, %1
+  %bc = bitcast half %2 to i16
+  %3 = insertelement <8 x i16> undef, i16 %bc, i32 0
+  %out = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
+  %outbc = bitcast <8 x i16> %out to <8 x half>
+  %ext = extractelement <8 x half> %outbc, i32 2
+  ret half %ext
+}
-- 
2.7.4
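
As the NOTE headers state, the CHECK lines in these tests are autogenerated by
utils/update_llc_test_checks.py. For reference, a single test can be exercised by
hand by expanding the %s substitutions in its RUN line; a minimal sketch, assuming
llc and FileCheck from an LLVM build are on PATH and the command is run from the
monorepo root:

  llc -mtriple=thumbv8.1m.main-none-none-eabi -verify-machineinstrs \
      -mattr=+mve.fp llvm/test/CodeGen/Thumb2/mve-vaddqr.ll -o - \
    | FileCheck llvm/test/CodeGen/Thumb2/mve-vaddqr.ll

Here llc compiles the IR to Thumb2/MVE assembly on stdout, and FileCheck matches
that output against the CHECK lines embedded in the same test file.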