From 8105f555af3617b273740a64d2ad50fdf595d7a1 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 7 Sep 2022 18:35:29 +0100
Subject: [PATCH] [AArch64] Add tests for using tbl for fp conversions.

---
 llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll | 477 ++++++++++++++++++++++
 1 file changed, 477 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll

diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
new file mode 100644
index 0000000..8ee68a2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -0,0 +1,477 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -o - %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+; It's profitable to convert the fptoui float -> i8 to first convert from
+; float -> i32 and then use tbl for the truncate in a loop, so the mask can be
+; materialized outside the loop.
+define void @fptoui_v8f32_to_v8i8_in_loop(ptr %A, ptr %dst) {
+; CHECK-LABEL: fptoui_v8f32_to_v8i8_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB0_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x9, x0, x8, lsl #5
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: ldp q0, q1, [x9]
+; CHECK-NEXT: fcvtzs.4s v0, v0
+; CHECK-NEXT: fcvtzs.4s v1, v1
+; CHECK-NEXT: xtn.4h v0, v0
+; CHECK-NEXT: xtn.4h v1, v1
+; CHECK-NEXT: uzp1.8b v0, v0, v1
+; CHECK-NEXT: str d0, [x1], #16
+; CHECK-NEXT: b.eq LBB0_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+entry:
+  br label %loop
+
+loop:
+  %idx = phi i64 [ 0, %entry ], [ %idx.next, %loop ]
+  %src.addr = getelementptr inbounds <8 x float>, ptr %A, i64 %idx
+  %wide = load <8 x float>, ptr %src.addr
+  %narrow = fptoui <8 x float> %wide to <8 x i8> ; conversion under test
+  %dst.addr = getelementptr inbounds <16 x i8>, ptr %dst, i64 %idx ; 16-byte stride; the store below writes only 8 bytes
+  store <8 x i8> %narrow, ptr %dst.addr
+  %idx.next = add i64 %idx, 1
+  %at.limit = icmp eq i64 %idx.next, 1000
+  br i1 %at.limit, label %loop, label %exit ; loops back only while %at.limit is true
+
+exit:
+  ret void
+}
+
+; Straight-line variant: tbl is not profitable here, because materializing the
+; masks requires more instructions.
+define void @fptoui_v8f32_to_v8i8_no_loop(ptr %A, ptr %dst) {
+; CHECK-LABEL: fptoui_v8f32_to_v8i8_no_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: fcvtzs.4s v0, v0
+; CHECK-NEXT: fcvtzs.4s v1, v1
+; CHECK-NEXT: xtn.4h v0, v0
+; CHECK-NEXT: xtn.4h v1, v1
+; CHECK-NEXT: uzp1.8b v0, v0, v1
+; CHECK-NEXT: str d0, [x1]
+; CHECK-NEXT: ret
+entry:
+  %wide = load <8 x float>, ptr %A
+  %narrow = fptoui <8 x float> %wide to <8 x i8> ; same conversion, no surrounding loop
+  store <8 x i8> %narrow, ptr %dst
+  ret void
+}
+
+; A shuffle that combines several fptoui results can also be lowered with tbl.
+; The loop vectorizer may create such patterns.
+define void @fptoui_2x_v8f32_to_v8i8_in_loop(ptr %A, ptr %B, ptr %dst) {
+; CHECK-LABEL: fptoui_2x_v8f32_to_v8i8_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB2_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: lsl x9, x8, #5
+; CHECK-NEXT: add x10, x1, x9
+; CHECK-NEXT: add x9, x0, x9
+; CHECK-NEXT: ldp q1, q0, [x10]
+; CHECK-NEXT: fcvtzs.4s v1, v1
+; CHECK-NEXT: ldp q3, q2, [x9]
+; CHECK-NEXT: fcvtzs.4s v0, v0
+; CHECK-NEXT: xtn.4h v1, v1
+; CHECK-NEXT: fcvtzs.4s v3, v3
+; CHECK-NEXT: xtn.4h v0, v0
+; CHECK-NEXT: fcvtzs.4s v2, v2
+; CHECK-NEXT: mov.d v1[1], v0[0]
+; CHECK-NEXT: xtn.4h v3, v3
+; CHECK-NEXT: xtn.4h v2, v2
+; CHECK-NEXT: mov.d v3[1], v2[0]
+; CHECK-NEXT: uzp1.16b v0, v3, v1
+; CHECK-NEXT: str q0, [x2, x8, lsl #4]
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: b.eq LBB2_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A = getelementptr inbounds <8 x float>, ptr %A, i64 %iv
+  %gep.B = getelementptr inbounds <8 x float>, ptr %B, i64 %iv
+  %l.A = load <8 x float>, ptr %gep.A
+  %l.B = load <8 x float>, ptr %gep.B
+  %c1 = fptoui <8 x float> %l.A to <8 x i8> ; first conversion (from %A)
+  %c2 = fptoui <8 x float> %l.B to <8 x i8> ; second conversion (from %B)
+  %s = shufflevector <8 x i8> %c1, <8 x i8> %c2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; concat-shuffle: %c1 in lanes 0-7, %c2 in lanes 8-15
+  %gep.dst = getelementptr inbounds <16 x i8>, ptr %dst, i64 %iv
+  store <16 x i8> %s, ptr %gep.dst
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit ; loops back only while %ec is true
+
+exit:
+  ret void
+}
+
+; Should not use tbl lowering, as the shuffle isn't a concat-shuffle.
+define void @fptoui_2x_v8f32_to_v8i8_in_loop_no_concat_shuffle(ptr %A, ptr %B, ptr %dst) {
+; CHECK-LABEL: fptoui_2x_v8f32_to_v8i8_in_loop_no_concat_shuffle:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: Lloh0:
+; CHECK-NEXT: adrp x9, lCPI3_0@PAGE
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: Lloh1:
+; CHECK-NEXT: ldr q0, [x9, lCPI3_0@PAGEOFF]
+; CHECK-NEXT: LBB3_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: lsl x9, x8, #5
+; CHECK-NEXT: add x10, x0, x9
+; CHECK-NEXT: add x9, x1, x9
+; CHECK-NEXT: ldp q1, q2, [x10]
+; CHECK-NEXT: fcvtzs.4s v1, v1
+; CHECK-NEXT: fcvtzs.4s v2, v2
+; CHECK-NEXT: xtn.4h v1, v1
+; CHECK-NEXT: umov.h w10, v1[0]
+; CHECK-NEXT: umov.h w11, v1[2]
+; CHECK-NEXT: fmov s3, w10
+; CHECK-NEXT: umov.h w10, v1[3]
+; CHECK-NEXT: xtn.4h v1, v2
+; CHECK-NEXT: ldp q2, q5, [x9]
+; CHECK-NEXT: mov.b v3[2], w11
+; CHECK-NEXT: umov.h w11, v1[0]
+; CHECK-NEXT: umov.h w9, v1[1]
+; CHECK-NEXT: fcvtzs.4s v2, v2
+; CHECK-NEXT: mov.b v3[3], w10
+; CHECK-NEXT: umov.h w10, v1[2]
+; CHECK-NEXT: xtn.4h v1, v2
+; CHECK-NEXT: fcvtzs.4s v2, v5
+; CHECK-NEXT: mov.b v3[4], w11
+; CHECK-NEXT: umov.h w11, v1[1]
+; CHECK-NEXT: mov.b v3[5], w9
+; CHECK-NEXT: umov.h w9, v1[0]
+; CHECK-NEXT: mov.b v3[6], w10
+; CHECK-NEXT: umov.h w10, v1[3]
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: umov.h w9, v1[2]
+; CHECK-NEXT: xtn.4h v1, v2
+; CHECK-NEXT: mov.b v4[1], w11
+; CHECK-NEXT: mov.b v4[2], w9
+; CHECK-NEXT: umov.h w9, v1[0]
+; CHECK-NEXT: mov.b v4[3], w10
+; CHECK-NEXT: umov.h w10, v1[2]
+; CHECK-NEXT: mov.b v4[4], w9
+; CHECK-NEXT: umov.h w9, v1[3]
+; CHECK-NEXT: mov.b v4[6], w10
+; CHECK-NEXT: mov.b v4[7], w9
+; CHECK-NEXT: tbl.16b v1, { v3, v4 }, v0
+; CHECK-NEXT: str q1, [x2, x8, lsl #4]
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: b.eq LBB3_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A = getelementptr inbounds <8 x float>, ptr %A, i64 %iv
+  %gep.B = getelementptr inbounds <8 x float>, ptr %B, i64 %iv
+  %l.A = load <8 x float>, ptr %gep.A
+  %l.B = load <8 x float>, ptr %gep.B
+  %c1 = fptoui <8 x float> %l.A to <8 x i8> ; first conversion (from %A)
+  %c2 = fptoui <8 x float> %l.B to <8 x i8> ; second conversion (from %B)
+  %s = shufflevector <8 x i8> %c1, <8 x i8> %c2, <16 x i32> ; NOTE(review): mask operand is missing in this copy; restore the non-concat <16 x i32> mask from the upstream test
+  %gep.dst = getelementptr inbounds <16 x i8>, ptr %dst, i64 %iv
+  store <16 x i8> %s, ptr %gep.dst
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit ; loops back only while %ec is true
+
+exit:
+  ret void
+}
+
+
+define void @fptoui_v16f32_to_v16i8_in_loop(ptr %A, ptr %dst) {
+; CHECK-LABEL: fptoui_v16f32_to_v16i8_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB4_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x9, x0, x8, lsl #6
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: ldp q0, q1, [x9, #32]
+; CHECK-NEXT: fcvtzs.4s v0, v0
+; CHECK-NEXT: ldp q2, q3, [x9]
+; CHECK-NEXT: fcvtzs.4s v1, v1
+; CHECK-NEXT: xtn.4h v0, v0
+; CHECK-NEXT: fcvtzs.4s v2, v2
+; CHECK-NEXT: xtn.4h v1, v1
+; CHECK-NEXT: fcvtzs.4s v3, v3
+; CHECK-NEXT: mov.d v0[1], v1[0]
+; CHECK-NEXT: xtn.4h v2, v2
+; CHECK-NEXT: xtn.4h v3, v3
+; CHECK-NEXT: mov.d v2[1], v3[0]
+; CHECK-NEXT: uzp1.16b v0, v2, v0
+; CHECK-NEXT: str q0, [x1], #32
+; CHECK-NEXT: b.eq LBB4_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A = getelementptr inbounds <16 x float>, ptr %A, i64 %iv
+  %l.A = load <16 x float>, ptr %gep.A
+  %c = fptoui <16 x float> %l.A to <16 x i8> ; conversion under test
+  %gep.dst = getelementptr inbounds <32 x i8>, ptr %dst, i64 %iv ; 32-byte stride; the store below writes only 16 bytes
+  store <16 x i8> %c, ptr %gep.dst
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit ; loops back only while %ec is true
+
+exit:
+  ret void
+}
+
+define void @fptoui_2x_v16f32_to_v16i8_in_loop(ptr %A, ptr %B, ptr %dst) {
+; CHECK-LABEL: fptoui_2x_v16f32_to_v16i8_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB5_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: lsl x9, x8, #6
+; CHECK-NEXT: add x10, x0, x9
+; CHECK-NEXT: add x9, x1, x9
+; CHECK-NEXT: ldp q0, q1, [x10]
+; CHECK-NEXT: fcvtzs.4s v0, v0
+; CHECK-NEXT: ldp q2, q3, [x10, #32]
+; CHECK-NEXT: fcvtzs.4s v1, v1
+; CHECK-NEXT: xtn.4h v0, v0
+; CHECK-NEXT: fcvtzs.4s v2, v2
+; CHECK-NEXT: xtn.4h v1, v1
+; CHECK-NEXT: ldp q4, q5, [x9]
+; CHECK-NEXT: fcvtzs.4s v3, v3
+; CHECK-NEXT: xtn.4h v2, v2
+; CHECK-NEXT: mov.d v0[1], v1[0]
+; CHECK-NEXT: fcvtzs.4s v4, v4
+; CHECK-NEXT: xtn.4h v3, v3
+; CHECK-NEXT: ldp q6, q7, [x9, #32]
+; CHECK-NEXT: fcvtzs.4s v5, v5
+; CHECK-NEXT: add x9, x2, x8, lsl #5
+; CHECK-NEXT: xtn.4h v4, v4
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: mov.d v2[1], v3[0]
+; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: fcvtzs.4s v6, v6
+; CHECK-NEXT: xtn.4h v5, v5
+; CHECK-NEXT: fcvtzs.4s v7, v7
+; CHECK-NEXT: mov.d v4[1], v5[0]
+; CHECK-NEXT: xtn.4h v6, v6
+; CHECK-NEXT: uzp1.16b v0, v0, v2
+; CHECK-NEXT: xtn.4h v7, v7
+; CHECK-NEXT: mov.d v6[1], v7[0]
+; CHECK-NEXT: uzp1.16b v1, v4, v6
+; CHECK-NEXT: stp q0, q1, [x9]
+; CHECK-NEXT: b.eq LBB5_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A = getelementptr inbounds <16 x float>, ptr %A, i64 %iv
+  %gep.B = getelementptr inbounds <16 x float>, ptr %B, i64 %iv
+  %l.A = load <16 x float>, ptr %gep.A
+  %l.B = load <16 x float>, ptr %gep.B
+  %c1 = fptoui <16 x float> %l.A to <16 x i8> ; first conversion (from %A)
+  %c2 = fptoui <16 x float> %l.B to <16 x i8> ; second conversion (from %B)
+  %s = shufflevector <16 x i8> %c1, <16 x i8> %c2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; concat-shuffle: %c1 in lanes 0-15, %c2 in lanes 16-31
+  %gep.dst = getelementptr inbounds <32 x i8>, ptr %dst, i64 %iv
+  store <32 x i8> %s, ptr %gep.dst
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit ; loops back only while %ec is true
+
+exit:
+  ret void
+}
+
+define void @fptoui_v8f32_to_v8i16_in_loop(ptr %A, ptr %dst) {
+; CHECK-LABEL: fptoui_v8f32_to_v8i16_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB6_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x9, x0, x8, lsl #5
+; CHECK-NEXT: ldp q0, q1, [x9]
+; CHECK-NEXT: fcvtzu.4s v0, v0
+; CHECK-NEXT: fcvtzu.4s v1, v1
+; CHECK-NEXT: uzp1.8h v0, v0, v1
+; CHECK-NEXT: str q0, [x1, x8, lsl #4]
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: b.eq LBB6_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A = getelementptr inbounds <8 x float>, ptr %A, i64 %iv
+  %l.A = load <8 x float>, ptr %gep.A
+  %c = fptoui <8 x float> %l.A to <8 x i16> ; conversion under test (i16 result)
+  %gep.dst = getelementptr inbounds <8 x i16>, ptr %dst, i64 %iv
+  store <8 x i16> %c, ptr %gep.dst
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit ; loops back only while %ec is true
+
+exit:
+  ret void
+}
+
+define void @fptoui_2x_v8f32_to_v8i16_in_loop(ptr %A, ptr %B, ptr %dst) {
+; CHECK-LABEL: fptoui_2x_v8f32_to_v8i16_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB7_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: lsl x9, x8, #5
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: add x10, x0, x9
+; CHECK-NEXT: add x11, x1, x9
+; CHECK-NEXT: add x9, x2, x9
+; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: ldp q0, q1, [x10]
+; CHECK-NEXT: fcvtzu.4s v0, v0
+; CHECK-NEXT: ldp q2, q3, [x11]
+; CHECK-NEXT: fcvtzu.4s v1, v1
+; CHECK-NEXT: fcvtzu.4s v2, v2
+; CHECK-NEXT: uzp1.8h v0, v0, v1
+; CHECK-NEXT: fcvtzu.4s v3, v3
+; CHECK-NEXT: uzp1.8h v1, v2, v3
+; CHECK-NEXT: stp q0, q1, [x9]
+; CHECK-NEXT: b.eq LBB7_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A = getelementptr inbounds <8 x float>, ptr %A, i64 %iv
+  %gep.B = getelementptr inbounds <8 x float>, ptr %B, i64 %iv
+  %l.A = load <8 x float>, ptr %gep.A
+  %l.B = load <8 x float>, ptr %gep.B
+  %c1 = fptoui <8 x float> %l.A to <8 x i16> ; first conversion (from %A)
+  %c2 = fptoui <8 x float> %l.B to <8 x i16> ; second conversion (from %B)
+  %s = shufflevector <8 x i16> %c1, <8 x i16> %c2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; concat-shuffle: %c1 in lanes 0-7, %c2 in lanes 8-15
+  %gep.dst = getelementptr inbounds <16 x i16>, ptr %dst, i64 %iv
+  store <16 x i16> %s, ptr %gep.dst
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit ; loops back only while %ec is true
+
+exit:
+  ret void
+}
+
+define void @uitofp_v8i8_to_v8f32(ptr %src, ptr %dst) {
+; CHECK-LABEL: uitofp_v8i8_to_v8f32:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB8_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d0, [x0, x8, lsl #3]
+; CHECK-NEXT: add x9, x1, x8, lsl #5
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: zip1.8b v1, v0, v0
+; CHECK-NEXT: zip2.8b v0, v0, v0
+; CHECK-NEXT: bic.4h v1, #255, lsl #8
+; CHECK-NEXT: bic.4h v0, #255, lsl #8
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: ucvtf.4s v0, v0
+; CHECK-NEXT: ucvtf.4s v1, v1
+; CHECK-NEXT: stp q1, q0, [x9]
+; CHECK-NEXT: b.eq LBB8_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.src = getelementptr inbounds <8 x i8>, ptr %src, i64 %iv
+  %l = load <8 x i8>, ptr %gep.src
+  %conv = uitofp <8 x i8> %l to <8 x float> ; widening conversion under test
+  %gep.dst = getelementptr inbounds <8 x float>, ptr %dst, i64 %iv
+  store <8 x float> %conv, ptr %gep.dst
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit ; loops back only while %ec is true
+
+exit:
+  ret void
+}
+
+define void @uitofp_v16i8_to_v16f32(ptr %src, ptr %dst) {
+; CHECK-LABEL: uitofp_v16i8_to_v16f32:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB9_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr q0, [x0, x8, lsl #4]
+; CHECK-NEXT: add x9, x1, x8, lsl #6
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: ext.16b v1, v0, v0, #8
+; CHECK-NEXT: zip1.8b v2, v0, v0
+; CHECK-NEXT: zip2.8b v0, v0, v0
+; CHECK-NEXT: bic.4h v2, #255, lsl #8
+; CHECK-NEXT: zip1.8b v3, v1, v0
+; CHECK-NEXT: zip2.8b v1, v1, v0
+; CHECK-NEXT: bic.4h v0, #255, lsl #8
+; CHECK-NEXT: ushll.4s v2, v2, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: bic.4h v3, #255, lsl #8
+; CHECK-NEXT: bic.4h v1, #255, lsl #8
+; CHECK-NEXT: ucvtf.4s v2, v2
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: ucvtf.4s v0, v0
+; CHECK-NEXT: ushll.4s v3, v3, #0
+; CHECK-NEXT: ucvtf.4s v1, v1
+; CHECK-NEXT: ucvtf.4s v3, v3
+; CHECK-NEXT: stp q2, q0, [x9]
+; CHECK-NEXT: stp q3, q1, [x9, #32]
+; CHECK-NEXT: b.eq LBB9_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.src = getelementptr inbounds <16 x i8>, ptr %src, i64 %iv
+  %l = load <16 x i8>, ptr %gep.src
+  %conv = uitofp <16 x i8> %l to <16 x float> ; widening conversion under test
+  %gep.dst = getelementptr inbounds <16 x float>, ptr %dst, i64 %iv
+  store <16 x float> %conv, ptr %gep.dst
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit ; loops back only while %ec is true
+
+exit:
+  ret void
+}
-- 
2.7.4