From f15add7d93aeabbfa381499a04eae769cc1cf4f0 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 1 Jun 2022 16:08:19 -0700 Subject: [PATCH] [RISCV] Split fixed-vector-strided-load-store.ll so it can be autogened I've gotten tired of updating register allocation changes by hand, let's just autogen this even if we have to duplicate it. --- .../rvv/fixed-vector-strided-load-store-asm.ll | 769 +++++++++++++++++++++ .../RISCV/rvv/fixed-vector-strided-load-store.ll | 325 +-------- 2 files changed, 781 insertions(+), 313 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll new file mode 100644 index 0000000..430b364 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll @@ -0,0 +1,769 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-min=256 | FileCheck %s + +%struct.foo = type { i32, i32, i32, i32 } + +; void gather(signed char * __restrict A, signed char * __restrict B) { +; for (int i = 0; i != 1024; ++i) +; A[i] += B[i * 5]; +; } +define void @gather(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; +; CHECK-LABEL: gather: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: li a4, 5 +; CHECK-NEXT: li a5, 1024 +; CHECK-NEXT: .LBB0_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu +; CHECK-NEXT: vlse8.v v8, (a1), a4 +; CHECK-NEXT: add a6, a0, a2 +; CHECK-NEXT: vle8.v v9, (a6) +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vse8.v v8, (a6) +; CHECK-NEXT: addi a2, a2, 32 +; CHECK-NEXT: addi a1, a1, 160 +; CHECK-NEXT: bne a2, a5, .LBB0_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <32 x i64> %vec.ind, + %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> , <32 x i8> undef) + %2 = getelementptr inbounds i8, i8* %A, i64 %index + %3 = bitcast i8* %2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %3, align 1 + %4 = add <32 x i8> %wide.load, %wide.masked.gather + %5 = bitcast i8* %2 to <32 x i8>* + store <32 x i8> %4, <32 x i8>* %5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @gather_masked(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, <32 x i8> %maskedoff) { +; +; CHECK-LABEL: gather_masked: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: lui a3, 983765 +; CHECK-NEXT: addiw a3, a3, 873 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vmv.s.x v0, a3 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: li a4, 5 +; CHECK-NEXT: li a5, 1024 +; CHECK-NEXT: .LBB1_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vlse8.v v9, (a1), a4, v0.t +; 
CHECK-NEXT: add a6, a0, a2 +; CHECK-NEXT: vle8.v v10, (a6) +; CHECK-NEXT: vadd.vv v9, v10, v9 +; CHECK-NEXT: vse8.v v9, (a6) +; CHECK-NEXT: addi a2, a2, 32 +; CHECK-NEXT: addi a1, a1, 160 +; CHECK-NEXT: bne a2, a5, .LBB1_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <32 x i64> %vec.ind, + %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> , <32 x i8> %maskedoff) + %2 = getelementptr inbounds i8, i8* %A, i64 %index + %3 = bitcast i8* %2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %3, align 1 + %4 = add <32 x i8> %wide.load, %wide.masked.gather + %5 = bitcast i8* %2 to <32 x i8>* + store <32 x i8> %4, <32 x i8>* %5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @gather_negative_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; +; CHECK-LABEL: gather_negative_stride: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: addi a1, a1, 155 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: li a4, -5 +; CHECK-NEXT: li a5, 1024 +; CHECK-NEXT: .LBB2_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu +; CHECK-NEXT: vlse8.v v8, (a1), a4 +; CHECK-NEXT: add a6, a0, a2 +; CHECK-NEXT: vle8.v v9, (a6) +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vse8.v v8, (a6) +; CHECK-NEXT: addi a2, a2, 32 +; CHECK-NEXT: addi a1, a1, 160 +; CHECK-NEXT: bne a2, a5, .LBB2_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <32 x i64> %vec.ind, + %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> , <32 x i8> undef) + %2 = getelementptr inbounds i8, i8* %A, i64 %index + %3 = bitcast i8* %2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %3, align 1 + %4 = add <32 x i8> %wide.load, %wide.masked.gather + %5 = bitcast i8* %2 to <32 x i8>* + store <32 x i8> %4, <32 x i8>* %5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @gather_zero_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; +; CHECK-LABEL: gather_zero_stride: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: li a4, 1024 +; CHECK-NEXT: .LBB3_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu +; CHECK-NEXT: vlse8.v v8, (a1), zero +; CHECK-NEXT: add a5, a0, a2 +; CHECK-NEXT: vle8.v v9, (a5) +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vse8.v v8, (a5) +; CHECK-NEXT: addi a2, a2, 32 +; CHECK-NEXT: addi a1, a1, 
160 +; CHECK-NEXT: bne a2, a4, .LBB3_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <32 x i64> %vec.ind, + %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> , <32 x i8> undef) + %2 = getelementptr inbounds i8, i8* %A, i64 %index + %3 = bitcast i8* %2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %3, align 1 + %4 = add <32 x i8> %wide.load, %wide.masked.gather + %5 = bitcast i8* %2 to <32 x i8>* + store <32 x i8> %4, <32 x i8>* %5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +;void scatter(signed char * __restrict A, signed char * __restrict B) { +; for (int i = 0; i < 1024; ++i) +; A[i * 5] += B[i]; +;} +define void @scatter(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; +; CHECK-LABEL: scatter: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: li a4, 5 +; CHECK-NEXT: li a5, 1024 +; CHECK-NEXT: .LBB4_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add a6, a1, a2 +; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v8, (a6) +; CHECK-NEXT: vlse8.v v9, (a0), a4 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsse8.v v8, (a0), a4 +; CHECK-NEXT: addi a2, a2, 32 +; CHECK-NEXT: addi a0, a0, 160 +; CHECK-NEXT: bne a2, a5, .LBB4_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = getelementptr inbounds i8, i8* %B, i64 %index + %1 = bitcast i8* %0 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %1, align 1 + %2 = mul nuw nsw <32 x i64> %vec.ind, + %3 = getelementptr inbounds i8, i8* %A, <32 x i64> %2 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %3, i32 1, <32 x i1> , <32 x i8> undef) + %4 = add <32 x i8> %wide.masked.gather, %wide.load + call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> %4, <32 x i8*> %3, i32 1, <32 x i1> ) + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %5 = icmp eq i64 %index.next, 1024 + br i1 %5, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @scatter_masked(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, <32 x i8> %maskedoff) { +; +; CHECK-LABEL: scatter_masked: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: lui a4, 983765 +; CHECK-NEXT: addiw a4, a4, 873 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vmv.s.x v0, a4 +; CHECK-NEXT: li a4, 5 +; CHECK-NEXT: li a5, 1024 +; CHECK-NEXT: .LBB5_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add a6, a1, a2 +; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v9, (a6) +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vlse8.v v10, (a0), 
a4, v0.t +; CHECK-NEXT: vadd.vv v9, v10, v9 +; CHECK-NEXT: vsse8.v v9, (a0), a4, v0.t +; CHECK-NEXT: addi a2, a2, 32 +; CHECK-NEXT: addi a0, a0, 160 +; CHECK-NEXT: bne a2, a5, .LBB5_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = getelementptr inbounds i8, i8* %B, i64 %index + %1 = bitcast i8* %0 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %1, align 1 + %2 = mul nuw nsw <32 x i64> %vec.ind, + %3 = getelementptr inbounds i8, i8* %A, <32 x i64> %2 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %3, i32 1, <32 x i1> , <32 x i8> %maskedoff) + %4 = add <32 x i8> %wide.masked.gather, %wide.load + call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> %4, <32 x i8*> %3, i32 1, <32 x i1> ) + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %5 = icmp eq i64 %index.next, 1024 + br i1 %5, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +; void gather_pow2(signed char * __restrict A, signed char * __restrict B) { +; for (int i = 0; i != 1024; ++i) +; A[i] += B[i * 4]; +; } +define void @gather_pow2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B) { +; +; CHECK-LABEL: gather_pow2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: li a4, 32 +; CHECK-NEXT: .LBB6_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, mu +; CHECK-NEXT: vlse32.v v8, (a1), a3 +; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, mu +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, mu +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: addi a2, a2, -8 +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: addi a1, a1, 128 +; CHECK-NEXT: bnez a2, .LBB6_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <8 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = shl nsw <8 x i64> %vec.ind, + %1 = getelementptr inbounds i32, i32* %B, <8 x i64> %0 + %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %1, i32 4, <8 x i1> , <8 x i32> undef) + %2 = getelementptr inbounds i32, i32* %A, i64 %index + %3 = bitcast i32* %2 to <8 x i32>* + %wide.load = load <8 x i32>, <8 x i32>* %3, align 1 + %4 = add <8 x i32> %wide.load, %wide.masked.gather + %5 = bitcast i32* %2 to <8 x i32>* + store <8 x i32> %4, <8 x i32>* %5, align 1 + %index.next = add nuw i64 %index, 8 + %vec.ind.next = add <8 x i64> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +;void scatter_pow2(signed char * __restrict A, signed char * __restrict B) { +; for (int i = 0; i < 1024; ++i) +; A[i * 4] += B[i]; +;} +define void @scatter_pow2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B) { +; +; CHECK-LABEL: scatter_pow2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: li a4, 16 +; CHECK-NEXT: .LBB7_1: # %vector.body +; CHECK-NEXT: 
# =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, mu +; CHECK-NEXT: vlse32.v v9, (a0), a4 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsse32.v v8, (a0), a4 +; CHECK-NEXT: addi a2, a2, -8 +; CHECK-NEXT: addi a1, a1, 32 +; CHECK-NEXT: addi a0, a0, 128 +; CHECK-NEXT: bnez a2, .LBB7_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <8 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %B, i64 %index + %1 = bitcast i32* %0 to <8 x i32>* + %wide.load = load <8 x i32>, <8 x i32>* %1, align 1 + %2 = shl nuw nsw <8 x i64> %vec.ind, + %3 = getelementptr inbounds i32, i32* %A, <8 x i64> %2 + %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %3, i32 4, <8 x i1> , <8 x i32> undef) + %4 = add <8 x i32> %wide.masked.gather, %wide.load + call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %4, <8 x i32*> %3, i32 4, <8 x i1> ) + %index.next = add nuw i64 %index, 8 + %vec.ind.next = add <8 x i64> %vec.ind, + %5 = icmp eq i64 %index.next, 1024 + br i1 %5, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +;struct foo { +; int a, b, c, d; +;}; +; +;void struct_gather(int * __restrict A, struct foo * __restrict B) { +; for (int i = 0; i < 1024; ++i) +; A[i] += B[i].b; +;} +define void @struct_gather(i32* noalias nocapture %A, %struct.foo* noalias nocapture readonly %B) { +; +; CHECK-LABEL: struct_gather: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi a1, a1, 132 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, mu +; CHECK-NEXT: .LBB8_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: addi a4, a1, -128 +; CHECK-NEXT: vlse32.v v8, (a4), a3 +; CHECK-NEXT: vlse32.v v9, (a1), a3 +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: addi a4, a0, 32 +; CHECK-NEXT: vle32.v v11, (a4) +; CHECK-NEXT: vadd.vv v8, v10, v8 +; CHECK-NEXT: vadd.vv v9, v11, v9 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vse32.v v9, (a4) +; CHECK-NEXT: addi a2, a2, -16 +; CHECK-NEXT: addi a0, a0, 64 +; CHECK-NEXT: addi a1, a1, 256 +; CHECK-NEXT: bnez a2, .LBB8_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <8 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %step.add = add <8 x i64> %vec.ind, + %0 = getelementptr inbounds %struct.foo, %struct.foo* %B, <8 x i64> %vec.ind, i32 1 + %1 = getelementptr inbounds %struct.foo, %struct.foo* %B, <8 x i64> %step.add, i32 1 + %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %0, i32 4, <8 x i1> , <8 x i32> undef) + %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %1, i32 4, <8 x i1> , <8 x i32> undef) + %2 = getelementptr inbounds i32, i32* %A, i64 %index + %3 = bitcast i32* %2 to <8 x i32>* + %wide.load = load <8 x i32>, <8 x i32>* %3, align 4 + %4 = getelementptr inbounds i32, i32* %2, i64 8 + %5 = bitcast i32* %4 to <8 x i32>* + %wide.load10 = load <8 x i32>, <8 x i32>* %5, align 4 + %6 = add nsw <8 x i32> %wide.load, %wide.masked.gather + %7 = add nsw <8 x i32> 
%wide.load10, %wide.masked.gather9 + %8 = bitcast i32* %2 to <8 x i32>* + store <8 x i32> %6, <8 x i32>* %8, align 4 + %9 = bitcast i32* %4 to <8 x i32>* + store <8 x i32> %7, <8 x i32>* %9, align 4 + %index.next = add nuw i64 %index, 16 + %vec.ind.next = add <8 x i64> %vec.ind, + %10 = icmp eq i64 %index.next, 1024 + br i1 %10, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +;void gather_unroll(int * __restrict A, int * __restrict B) { +; for (int i = 0; i < 1024; i+= 4 ) { +; A[i] += B[i * 4]; +; A[i+1] += B[(i+1) * 4]; +; A[i+2] += B[(i+2) * 4]; +; A[i+3] += B[(i+3) * 4]; +; } +;} +define void @gather_unroll(i32* noalias nocapture %A, i32* noalias nocapture readonly %B) { +; +; CHECK-LABEL: gather_unroll: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a2, 256 +; CHECK-NEXT: li a3, 64 +; CHECK-NEXT: li a4, 16 +; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, mu +; CHECK-NEXT: .LBB9_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vlse32.v v8, (a1), a3 +; CHECK-NEXT: vlse32.v v9, (a0), a4 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsse32.v v8, (a0), a4 +; CHECK-NEXT: addi a5, a1, 16 +; CHECK-NEXT: vlse32.v v8, (a5), a3 +; CHECK-NEXT: addi a5, a0, 4 +; CHECK-NEXT: vlse32.v v9, (a5), a4 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsse32.v v8, (a5), a4 +; CHECK-NEXT: addi a5, a1, 32 +; CHECK-NEXT: vlse32.v v8, (a5), a3 +; CHECK-NEXT: addi a5, a0, 8 +; CHECK-NEXT: vlse32.v v9, (a5), a4 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsse32.v v8, (a5), a4 +; CHECK-NEXT: addi a5, a1, 48 +; CHECK-NEXT: vlse32.v v8, (a5), a3 +; CHECK-NEXT: addi a5, a0, 12 +; CHECK-NEXT: vlse32.v v9, (a5), a4 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsse32.v v8, (a5), a4 +; CHECK-NEXT: addi a2, a2, -8 +; CHECK-NEXT: addi a1, a1, 512 +; CHECK-NEXT: addi a0, a0, 128 +; CHECK-NEXT: bnez a2, .LBB9_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <8 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = shl nuw nsw <8 x i64> %vec.ind, + %1 = getelementptr inbounds i32, i32* %B, <8 x i64> %0 + %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %1, i32 4, <8 x i1> , <8 x i32> undef) + %2 = getelementptr inbounds i32, i32* %A, <8 x i64> %vec.ind + %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %2, i32 4, <8 x i1> , <8 x i32> undef) + %3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather + call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %3, <8 x i32*> %2, i32 4, <8 x i1> ) + %4 = or <8 x i64> %vec.ind, + %5 = shl nsw <8 x i64> %4, + %6 = getelementptr inbounds i32, i32* %B, <8 x i64> %5 + %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %6, i32 4, <8 x i1> , <8 x i32> undef) + %7 = getelementptr inbounds i32, i32* %A, <8 x i64> %4 + %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %7, i32 4, <8 x i1> , <8 x i32> undef) + %8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53 + call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %8, <8 x i32*> %7, i32 4, <8 x i1> ) + %9 = or <8 x i64> %vec.ind, + %10 = shl nsw <8 x i64> %9, + %11 = getelementptr inbounds i32, i32* %B, <8 x i64> %10 + %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %11, 
i32 4, <8 x i1> , <8 x i32> undef) + %12 = getelementptr inbounds i32, i32* %A, <8 x i64> %9 + %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %12, i32 4, <8 x i1> , <8 x i32> undef) + %13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55 + call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %13, <8 x i32*> %12, i32 4, <8 x i1> ) + %14 = or <8 x i64> %vec.ind, + %15 = shl nsw <8 x i64> %14, + %16 = getelementptr inbounds i32, i32* %B, <8 x i64> %15 + %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %16, i32 4, <8 x i1> , <8 x i32> undef) + %17 = getelementptr inbounds i32, i32* %A, <8 x i64> %14 + %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %17, i32 4, <8 x i1> , <8 x i32> undef) + %18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57 + call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %18, <8 x i32*> %17, i32 4, <8 x i1> ) + %index.next = add nuw i64 %index, 8 + %vec.ind.next = add <8 x i64> %vec.ind, + %19 = icmp eq i64 %index.next, 256 + br i1 %19, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32 immarg, <32 x i1>, <32 x i8>) +declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32 immarg, <8 x i1>, <8 x i32>) +declare void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8>, <32 x i8*>, i32 immarg, <32 x i1>) +declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32>, <8 x i32*>, i32 immarg, <8 x i1>) + +; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers. +define void @gather_of_pointers(i32** noalias nocapture %0, i32** noalias nocapture readonly %1) { +; +; CHECK-LABEL: gather_of_pointers: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: li a3, 40 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vlse64.v v8, (a1), a3 +; CHECK-NEXT: addi a4, a1, 80 +; CHECK-NEXT: vlse64.v v9, (a4), a3 +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a4, a0, 16 +; CHECK-NEXT: vse64.v v9, (a4) +; CHECK-NEXT: addi a2, a2, -4 +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: addi a1, a1, 160 +; CHECK-NEXT: bnez a2, .LBB10_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: ret + br label %3 + +3: ; preds = %3, %2 + %4 = phi i64 [ 0, %2 ], [ %17, %3 ] + %5 = phi <2 x i64> [ , %2 ], [ %18, %3 ] + %6 = mul nuw nsw <2 x i64> %5, + %7 = mul <2 x i64> %5, + %8 = add <2 x i64> %7, + %9 = getelementptr inbounds i32*, i32** %1, <2 x i64> %6 + %10 = getelementptr inbounds i32*, i32** %1, <2 x i64> %8 + %11 = call <2 x i32*> @llvm.masked.gather.v2p0i32.v2p0p0i32(<2 x i32**> %9, i32 8, <2 x i1> , <2 x i32*> undef) + %12 = call <2 x i32*> @llvm.masked.gather.v2p0i32.v2p0p0i32(<2 x i32**> %10, i32 8, <2 x i1> , <2 x i32*> undef) + %13 = getelementptr inbounds i32*, i32** %0, i64 %4 + %14 = bitcast i32** %13 to <2 x i32*>* + store <2 x i32*> %11, <2 x i32*>* %14, align 8 + %15 = getelementptr inbounds i32*, i32** %13, i64 2 + %16 = bitcast i32** %15 to <2 x i32*>* + store <2 x i32*> %12, <2 x i32*>* %16, align 8 + %17 = add nuw i64 %4, 4 + %18 = add <2 x i64> %5, + %19 = icmp eq i64 %17, 1024 + br i1 %19, label %20, label %3 + +20: ; preds = %3 + ret void +} + +declare <2 x i32*> @llvm.masked.gather.v2p0i32.v2p0p0i32(<2 x i32**>, i32 immarg, <2 x i1>, <2 x i32*>) + +; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers. 
+define void @scatter_of_pointers(i32** noalias nocapture %0, i32** noalias nocapture readonly %1) { +; +; CHECK-LABEL: scatter_of_pointers: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: li a3, 40 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: addi a4, a1, 16 +; CHECK-NEXT: vle64.v v9, (a4) +; CHECK-NEXT: addi a4, a0, 80 +; CHECK-NEXT: vsse64.v v8, (a0), a3 +; CHECK-NEXT: vsse64.v v9, (a4), a3 +; CHECK-NEXT: addi a2, a2, -4 +; CHECK-NEXT: addi a1, a1, 32 +; CHECK-NEXT: addi a0, a0, 160 +; CHECK-NEXT: bnez a2, .LBB11_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: ret + br label %3 + +3: ; preds = %3, %2 + %4 = phi i64 [ 0, %2 ], [ %17, %3 ] + %5 = phi <2 x i64> [ , %2 ], [ %18, %3 ] + %6 = getelementptr inbounds i32*, i32** %1, i64 %4 + %7 = bitcast i32** %6 to <2 x i32*>* + %8 = load <2 x i32*>, <2 x i32*>* %7, align 8 + %9 = getelementptr inbounds i32*, i32** %6, i64 2 + %10 = bitcast i32** %9 to <2 x i32*>* + %11 = load <2 x i32*>, <2 x i32*>* %10, align 8 + %12 = mul nuw nsw <2 x i64> %5, + %13 = mul <2 x i64> %5, + %14 = add <2 x i64> %13, + %15 = getelementptr inbounds i32*, i32** %0, <2 x i64> %12 + %16 = getelementptr inbounds i32*, i32** %0, <2 x i64> %14 + call void @llvm.masked.scatter.v2p0i32.v2p0p0i32(<2 x i32*> %8, <2 x i32**> %15, i32 8, <2 x i1> ) + call void @llvm.masked.scatter.v2p0i32.v2p0p0i32(<2 x i32*> %11, <2 x i32**> %16, i32 8, <2 x i1> ) + %17 = add nuw i64 %4, 4 + %18 = add <2 x i64> %5, + %19 = icmp eq i64 %17, 1024 + br i1 %19, label %20, label %3 + +20: ; preds = %3 + ret void +} + +declare void @llvm.masked.scatter.v2p0i32.v2p0p0i32(<2 x i32*>, <2 x i32**>, i32 immarg, <2 x i1>) + +define void @strided_load_startval_add_with_splat(i8* noalias nocapture %0, i8* noalias nocapture readonly %1, i32 signext %2) { +; +; CHECK-LABEL: strided_load_startval_add_with_splat: +; CHECK: # %bb.0: +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: beq a2, a3, .LBB12_7 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a3, 1023 +; CHECK-NEXT: subw a4, a3, a2 +; CHECK-NEXT: li a5, 31 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: bltu a4, a5, .LBB12_5 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: slli a3, a4, 32 +; CHECK-NEXT: srli a3, a3, 32 +; CHECK-NEXT: addi a4, a3, 1 +; CHECK-NEXT: andi a5, a4, -32 +; CHECK-NEXT: add a3, a5, a2 +; CHECK-NEXT: slli a6, a2, 2 +; CHECK-NEXT: add a6, a6, a2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: add a6, a1, a6 +; CHECK-NEXT: li a7, 32 +; CHECK-NEXT: li t0, 5 +; CHECK-NEXT: mv t1, a5 +; CHECK-NEXT: .LBB12_3: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli zero, a7, e8, m1, ta, mu +; CHECK-NEXT: vlse8.v v8, (a6), t0 +; CHECK-NEXT: vle8.v v9, (a2) +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vse8.v v8, (a2) +; CHECK-NEXT: addi t1, t1, -32 +; CHECK-NEXT: addi a2, a2, 32 +; CHECK-NEXT: addi a6, a6, 160 +; CHECK-NEXT: bnez t1, .LBB12_3 +; CHECK-NEXT: # %bb.4: +; CHECK-NEXT: beq a4, a5, .LBB12_7 +; CHECK-NEXT: .LBB12_5: +; CHECK-NEXT: slli a2, a3, 2 +; CHECK-NEXT: add a2, a2, a3 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: .LBB12_6: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: lb a4, 0(a1) +; CHECK-NEXT: add a5, a0, a3 +; CHECK-NEXT: lb a6, 0(a5) +; CHECK-NEXT: addw a4, a6, a4 +; CHECK-NEXT: sb a4, 0(a5) +; CHECK-NEXT: addiw a4, a3, 1 +; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: addi a1, a1, 5 +; CHECK-NEXT: bne a4, a2, .LBB12_6 +; CHECK-NEXT: .LBB12_7: +; CHECK-NEXT: ret + %4 = icmp eq i32 %2, 1024 + br i1 %4, 
label %36, label %5 + +5: ; preds = %3 + %6 = sext i32 %2 to i64 + %7 = sub i32 1023, %2 + %8 = zext i32 %7 to i64 + %9 = add nuw nsw i64 %8, 1 + %10 = icmp ult i32 %7, 31 + br i1 %10, label %34, label %11 + +11: ; preds = %5 + %12 = and i64 %9, 8589934560 + %13 = add nsw i64 %12, %6 + %14 = insertelement <32 x i64> poison, i64 %6, i64 0 + %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <32 x i32> zeroinitializer + %16 = add <32 x i64> %15, + br label %17 + +17: ; preds = %17, %11 + %18 = phi i64 [ 0, %11 ], [ %29, %17 ] + %19 = phi <32 x i64> [ %16, %11 ], [ %30, %17 ] + %20 = add i64 %18, %6 + %21 = mul nsw <32 x i64> %19, + %22 = getelementptr inbounds i8, i8* %1, <32 x i64> %21 + %23 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %22, i32 1, <32 x i1> , <32 x i8> undef) + %24 = getelementptr inbounds i8, i8* %0, i64 %20 + %25 = bitcast i8* %24 to <32 x i8>* + %26 = load <32 x i8>, <32 x i8>* %25, align 1 + %27 = add <32 x i8> %26, %23 + %28 = bitcast i8* %24 to <32 x i8>* + store <32 x i8> %27, <32 x i8>* %28, align 1 + %29 = add nuw i64 %18, 32 + %30 = add <32 x i64> %19, + %31 = icmp eq i64 %29, %12 + br i1 %31, label %32, label %17 + +32: ; preds = %17 + %33 = icmp eq i64 %9, %12 + br i1 %33, label %36, label %34 + +34: ; preds = %5, %32 + %35 = phi i64 [ %6, %5 ], [ %13, %32 ] + br label %37 + +36: ; preds = %37, %32, %3 + ret void + +37: ; preds = %34, %37 + %38 = phi i64 [ %45, %37 ], [ %35, %34 ] + %39 = mul nsw i64 %38, 5 + %40 = getelementptr inbounds i8, i8* %1, i64 %39 + %41 = load i8, i8* %40, align 1 + %42 = getelementptr inbounds i8, i8* %0, i64 %38 + %43 = load i8, i8* %42, align 1 + %44 = add i8 %43, %41 + store i8 %44, i8* %42, align 1 + %45 = add nsw i64 %38, 1 + %46 = trunc i64 %45 to i32 + %47 = icmp eq i32 %46, 1024 + br i1 %47, label %36, label %37 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll index 22eef61..e5b48c1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll @@ -1,5 +1,5 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt %s -S -riscv-gather-scatter-lowering -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-min=256 | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefix=CHECK-ASM %struct.foo = type { i32, i32, i32, i32 } @@ -29,25 +29,6 @@ define void @gather(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; -; CHECK-ASM-LABEL: gather: -; CHECK-ASM: # %bb.0: # %entry -; CHECK-ASM-NEXT: li a2, 0 -; CHECK-ASM-NEXT: li a3, 32 -; CHECK-ASM-NEXT: li a4, 5 -; CHECK-ASM-NEXT: li a5, 1024 -; CHECK-ASM-NEXT: .LBB0_1: # %vector.body -; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ASM-NEXT: vsetvli zero, a3, e8, m1, ta, mu -; CHECK-ASM-NEXT: vlse8.v v8, (a1), a4 -; CHECK-ASM-NEXT: add a6, a0, a2 -; CHECK-ASM-NEXT: vle8.v v9, (a6) -; CHECK-ASM-NEXT: vadd.vv v8, v9, v8 -; CHECK-ASM-NEXT: vse8.v v8, (a6) -; CHECK-ASM-NEXT: addi a2, a2, 32 -; CHECK-ASM-NEXT: addi a1, a1, 160 -; CHECK-ASM-NEXT: bne a2, a5, .LBB0_1 -; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup -; CHECK-ASM-NEXT: ret entry: br label %vector.body @@ -94,30 +75,6 @@ define void @gather_masked(i8* noalias nocapture %A, i8* noalias nocapture reado ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; -; CHECK-ASM-LABEL: 
gather_masked: -; CHECK-ASM: # %bb.0: # %entry -; CHECK-ASM-NEXT: li a2, 0 -; CHECK-ASM-NEXT: lui a3, 983765 -; CHECK-ASM-NEXT: addiw a3, a3, 873 -; CHECK-ASM-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-ASM-NEXT: vmv.s.x v0, a3 -; CHECK-ASM-NEXT: li a3, 32 -; CHECK-ASM-NEXT: li a4, 5 -; CHECK-ASM-NEXT: li a5, 1024 -; CHECK-ASM-NEXT: .LBB1_1: # %vector.body -; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ASM-NEXT: vsetvli zero, a3, e8, m1, ta, mu -; CHECK-ASM-NEXT: vmv1r.v v9, v8 -; CHECK-ASM-NEXT: vlse8.v v9, (a1), a4, v0.t -; CHECK-ASM-NEXT: add a6, a0, a2 -; CHECK-ASM-NEXT: vle8.v v10, (a6) -; CHECK-ASM-NEXT: vadd.vv v9, v10, v9 -; CHECK-ASM-NEXT: vse8.v v9, (a6) -; CHECK-ASM-NEXT: addi a2, a2, 32 -; CHECK-ASM-NEXT: addi a1, a1, 160 -; CHECK-ASM-NEXT: bne a2, a5, .LBB1_1 -; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup -; CHECK-ASM-NEXT: ret entry: br label %vector.body @@ -143,6 +100,7 @@ for.cond.cleanup: ; preds = %vector.body } define void @gather_negative_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; ; CHECK-LABEL: @gather_negative_stride( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -164,26 +122,6 @@ define void @gather_negative_stride(i8* noalias nocapture %A, i8* noalias nocapt ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; -; CHECK-ASM-LABEL: gather_negative_stride: -; CHECK-ASM: # %bb.0: # %entry -; CHECK-ASM-NEXT: li a2, 0 -; CHECK-ASM-NEXT: addi a1, a1, 155 -; CHECK-ASM-NEXT: li a3, 32 -; CHECK-ASM-NEXT: li a4, -5 -; CHECK-ASM-NEXT: li a5, 1024 -; CHECK-ASM-NEXT: .LBB2_1: # %vector.body -; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ASM-NEXT: vsetvli zero, a3, e8, m1, ta, mu -; CHECK-ASM-NEXT: vlse8.v v8, (a1), a4 -; CHECK-ASM-NEXT: add a6, a0, a2 -; CHECK-ASM-NEXT: vle8.v v9, (a6) -; CHECK-ASM-NEXT: vadd.vv v8, v9, v8 -; CHECK-ASM-NEXT: vse8.v v8, (a6) -; CHECK-ASM-NEXT: addi a2, a2, 32 -; CHECK-ASM-NEXT: addi a1, a1, 160 -; CHECK-ASM-NEXT: bne a2, a5, .LBB2_1 -; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup -; CHECK-ASM-NEXT: ret entry: br label %vector.body @@ -209,6 +147,7 @@ for.cond.cleanup: ; preds = %vector.body } define void @gather_zero_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; ; CHECK-LABEL: @gather_zero_stride( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -230,24 +169,6 @@ define void @gather_zero_stride(i8* noalias nocapture %A, i8* noalias nocapture ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; -; CHECK-ASM-LABEL: gather_zero_stride: -; CHECK-ASM: # %bb.0: # %entry -; CHECK-ASM-NEXT: li a2, 0 -; CHECK-ASM-NEXT: li a3, 32 -; CHECK-ASM-NEXT: li a4, 1024 -; CHECK-ASM-NEXT: .LBB3_1: # %vector.body -; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ASM-NEXT: vsetvli zero, a3, e8, m1, ta, mu -; CHECK-ASM-NEXT: vlse8.v v8, (a1), zero -; CHECK-ASM-NEXT: add a5, a0, a2 -; CHECK-ASM-NEXT: vle8.v v9, (a5) -; CHECK-ASM-NEXT: vadd.vv v8, v9, v8 -; CHECK-ASM-NEXT: vse8.v v8, (a5) -; CHECK-ASM-NEXT: addi a2, a2, 32 -; CHECK-ASM-NEXT: addi a1, a1, 160 -; CHECK-ASM-NEXT: bne a2, a4, .LBB3_1 -; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup -; CHECK-ASM-NEXT: ret entry: br label %vector.body @@ -277,6 +198,7 @@ for.cond.cleanup: ; preds = %vector.body ; A[i * 5] += B[i]; ;} define void @scatter(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; ; CHECK-LABEL: @scatter( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -297,25 +219,6 @@ define void @scatter(i8* noalias nocapture %A, i8* 
noalias nocapture readonly %B ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; -; CHECK-ASM-LABEL: scatter: -; CHECK-ASM: # %bb.0: # %entry -; CHECK-ASM-NEXT: li a2, 0 -; CHECK-ASM-NEXT: li a3, 32 -; CHECK-ASM-NEXT: li a4, 5 -; CHECK-ASM-NEXT: li a5, 1024 -; CHECK-ASM-NEXT: .LBB4_1: # %vector.body -; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ASM-NEXT: add a6, a1, a2 -; CHECK-ASM-NEXT: vsetvli zero, a3, e8, m1, ta, mu -; CHECK-ASM-NEXT: vle8.v v8, (a6) -; CHECK-ASM-NEXT: vlse8.v v9, (a0), a4 -; CHECK-ASM-NEXT: vadd.vv v8, v9, v8 -; CHECK-ASM-NEXT: vsse8.v v8, (a0), a4 -; CHECK-ASM-NEXT: addi a2, a2, 32 -; CHECK-ASM-NEXT: addi a0, a0, 160 -; CHECK-ASM-NEXT: bne a2, a5, .LBB4_1 -; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup -; CHECK-ASM-NEXT: ret entry: br label %vector.body @@ -340,6 +243,7 @@ for.cond.cleanup: ; preds = %vector.body } define void @scatter_masked(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, <32 x i8> %maskedoff) { +; ; CHECK-LABEL: @scatter_masked( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -360,30 +264,6 @@ define void @scatter_masked(i8* noalias nocapture %A, i8* noalias nocapture read ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; -; CHECK-ASM-LABEL: scatter_masked: -; CHECK-ASM: # %bb.0: # %entry -; CHECK-ASM-NEXT: li a2, 0 -; CHECK-ASM-NEXT: li a3, 32 -; CHECK-ASM-NEXT: lui a4, 983765 -; CHECK-ASM-NEXT: addiw a4, a4, 873 -; CHECK-ASM-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-ASM-NEXT: vmv.s.x v0, a4 -; CHECK-ASM-NEXT: li a4, 5 -; CHECK-ASM-NEXT: li a5, 1024 -; CHECK-ASM-NEXT: .LBB5_1: # %vector.body -; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ASM-NEXT: add a6, a1, a2 -; CHECK-ASM-NEXT: vsetvli zero, a3, e8, m1, ta, mu -; CHECK-ASM-NEXT: vle8.v v9, (a6) -; CHECK-ASM-NEXT: vmv1r.v v10, v8 -; CHECK-ASM-NEXT: vlse8.v v10, (a0), a4, v0.t -; CHECK-ASM-NEXT: vadd.vv v9, v10, v9 -; CHECK-ASM-NEXT: vsse8.v v9, (a0), a4, v0.t -; CHECK-ASM-NEXT: addi a2, a2, 32 -; CHECK-ASM-NEXT: addi a0, a0, 160 -; CHECK-ASM-NEXT: bne a2, a5, .LBB5_1 -; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup -; CHECK-ASM-NEXT: ret entry: br label %vector.body @@ -412,6 +292,7 @@ for.cond.cleanup: ; preds = %vector.body ; A[i] += B[i * 4]; ; } define void @gather_pow2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B) { +; ; CHECK-LABEL: @gather_pow2( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -433,27 +314,6 @@ define void @gather_pow2(i32* noalias nocapture %A, i32* noalias nocapture reado ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; -; CHECK-ASM-LABEL: gather_pow2: -; CHECK-ASM: # %bb.0: # %entry -; CHECK-ASM-NEXT: li a2, 1024 -; CHECK-ASM-NEXT: li a3, 16 -; CHECK-ASM-NEXT: li a4, 32 -; CHECK-ASM-NEXT: .LBB6_1: # %vector.body -; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ASM-NEXT: vsetivli zero, 8, e32, m1, ta, mu -; CHECK-ASM-NEXT: vlse32.v v8, (a1), a3 -; CHECK-ASM-NEXT: vsetvli zero, a4, e8, m1, ta, mu -; CHECK-ASM-NEXT: vle8.v v9, (a0) -; CHECK-ASM-NEXT: vsetivli zero, 8, e32, m1, ta, mu -; CHECK-ASM-NEXT: vadd.vv v8, v9, v8 -; CHECK-ASM-NEXT: vsetvli zero, a4, e8, m1, ta, mu -; CHECK-ASM-NEXT: vse8.v v8, (a0) -; CHECK-ASM-NEXT: addi a2, a2, -8 -; CHECK-ASM-NEXT: addi a0, a0, 32 -; CHECK-ASM-NEXT: addi a1, a1, 128 -; CHECK-ASM-NEXT: bnez a2, .LBB6_1 -; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup -; CHECK-ASM-NEXT: ret entry: br label %vector.body @@ -483,6 +343,7 @@ for.cond.cleanup: ; preds = %vector.body ; A[i * 4] += B[i]; ;} define void 
@scatter_pow2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B) { +; ; CHECK-LABEL: @scatter_pow2( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -503,25 +364,6 @@ define void @scatter_pow2(i32* noalias nocapture %A, i32* noalias nocapture read ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; -; CHECK-ASM-LABEL: scatter_pow2: -; CHECK-ASM: # %bb.0: # %entry -; CHECK-ASM-NEXT: li a2, 1024 -; CHECK-ASM-NEXT: li a3, 32 -; CHECK-ASM-NEXT: li a4, 16 -; CHECK-ASM-NEXT: .LBB7_1: # %vector.body -; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ASM-NEXT: vsetvli zero, a3, e8, m1, ta, mu -; CHECK-ASM-NEXT: vle8.v v8, (a1) -; CHECK-ASM-NEXT: vsetivli zero, 8, e32, m1, ta, mu -; CHECK-ASM-NEXT: vlse32.v v9, (a0), a4 -; CHECK-ASM-NEXT: vadd.vv v8, v9, v8 -; CHECK-ASM-NEXT: vsse32.v v8, (a0), a4 -; CHECK-ASM-NEXT: addi a2, a2, -8 -; CHECK-ASM-NEXT: addi a1, a1, 32 -; CHECK-ASM-NEXT: addi a0, a0, 128 -; CHECK-ASM-NEXT: bnez a2, .LBB7_1 -; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup -; CHECK-ASM-NEXT: ret entry: br label %vector.body @@ -554,6 +396,7 @@ for.cond.cleanup: ; preds = %vector.body ; A[i] += B[i].b; ;} define void @struct_gather(i32* noalias nocapture %A, %struct.foo* noalias nocapture readonly %B) { +; ; CHECK-LABEL: @struct_gather( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -585,30 +428,6 @@ define void @struct_gather(i32* noalias nocapture %A, %struct.foo* noalias nocap ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; -; CHECK-ASM-LABEL: struct_gather: -; CHECK-ASM: # %bb.0: # %entry -; CHECK-ASM-NEXT: addi a1, a1, 132 -; CHECK-ASM-NEXT: li a2, 1024 -; CHECK-ASM-NEXT: li a3, 16 -; CHECK-ASM-NEXT: vsetivli zero, 8, e32, m1, ta, mu -; CHECK-ASM-NEXT: .LBB8_1: # %vector.body -; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ASM-NEXT: addi a4, a1, -128 -; CHECK-ASM-NEXT: vlse32.v v8, (a4), a3 -; CHECK-ASM-NEXT: vlse32.v v9, (a1), a3 -; CHECK-ASM-NEXT: vle32.v v10, (a0) -; CHECK-ASM-NEXT: addi a4, a0, 32 -; CHECK-ASM-NEXT: vle32.v v11, (a4) -; CHECK-ASM-NEXT: vadd.vv v8, v10, v8 -; CHECK-ASM-NEXT: vadd.vv v9, v11, v9 -; CHECK-ASM-NEXT: vse32.v v8, (a0) -; CHECK-ASM-NEXT: vse32.v v9, (a4) -; CHECK-ASM-NEXT: addi a2, a2, -16 -; CHECK-ASM-NEXT: addi a0, a0, 64 -; CHECK-ASM-NEXT: addi a1, a1, 256 -; CHECK-ASM-NEXT: bnez a2, .LBB8_1 -; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup -; CHECK-ASM-NEXT: ret entry: br label %vector.body @@ -650,6 +469,7 @@ for.cond.cleanup: ; preds = %vector.body ; } ;} define void @gather_unroll(i32* noalias nocapture %A, i32* noalias nocapture readonly %B) { +; ; CHECK-LABEL: @gather_unroll( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -701,42 +521,6 @@ define void @gather_unroll(i32* noalias nocapture %A, i32* noalias nocapture rea ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; -; CHECK-ASM-LABEL: gather_unroll: -; CHECK-ASM: # %bb.0: # %entry -; CHECK-ASM-NEXT: li a2, 256 -; CHECK-ASM-NEXT: li a3, 64 -; CHECK-ASM-NEXT: li a4, 16 -; CHECK-ASM-NEXT: vsetivli zero, 8, e32, m1, ta, mu -; CHECK-ASM-NEXT: .LBB9_1: # %vector.body -; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ASM-NEXT: vlse32.v v8, (a1), a3 -; CHECK-ASM-NEXT: vlse32.v v9, (a0), a4 -; CHECK-ASM-NEXT: vadd.vv v8, v9, v8 -; CHECK-ASM-NEXT: vsse32.v v8, (a0), a4 -; CHECK-ASM-NEXT: addi a5, a1, 16 -; CHECK-ASM-NEXT: vlse32.v v8, (a5), a3 -; CHECK-ASM-NEXT: addi a5, a0, 4 -; CHECK-ASM-NEXT: vlse32.v v9, (a5), a4 -; CHECK-ASM-NEXT: vadd.vv v8, v9, v8 -; CHECK-ASM-NEXT: vsse32.v v8, 
(a5), a4 -; CHECK-ASM-NEXT: addi a5, a1, 32 -; CHECK-ASM-NEXT: vlse32.v v8, (a5), a3 -; CHECK-ASM-NEXT: addi a5, a0, 8 -; CHECK-ASM-NEXT: vlse32.v v9, (a5), a4 -; CHECK-ASM-NEXT: vadd.vv v8, v9, v8 -; CHECK-ASM-NEXT: vsse32.v v8, (a5), a4 -; CHECK-ASM-NEXT: addi a5, a1, 48 -; CHECK-ASM-NEXT: vlse32.v v8, (a5), a3 -; CHECK-ASM-NEXT: addi a5, a0, 12 -; CHECK-ASM-NEXT: vlse32.v v9, (a5), a4 -; CHECK-ASM-NEXT: vadd.vv v8, v9, v8 -; CHECK-ASM-NEXT: vsse32.v v8, (a5), a4 -; CHECK-ASM-NEXT: addi a2, a2, -8 -; CHECK-ASM-NEXT: addi a1, a1, 512 -; CHECK-ASM-NEXT: addi a0, a0, 128 -; CHECK-ASM-NEXT: bnez a2, .LBB9_1 -; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup -; CHECK-ASM-NEXT: ret entry: br label %vector.body @@ -790,6 +574,7 @@ declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32>, <8 x i32*>, i32 immar ; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers. define void @gather_of_pointers(i32** noalias nocapture %0, i32** noalias nocapture readonly %1) { +; ; CHECK-LABEL: @gather_of_pointers( ; CHECK-NEXT: br label [[TMP3:%.*]] ; CHECK: 3: @@ -814,24 +599,6 @@ define void @gather_of_pointers(i32** noalias nocapture %0, i32** noalias nocapt ; CHECK: 15: ; CHECK-NEXT: ret void ; -; CHECK-ASM-LABEL: gather_of_pointers: -; CHECK-ASM: # %bb.0: -; CHECK-ASM-NEXT: li a2, 1024 -; CHECK-ASM-NEXT: li a3, 40 -; CHECK-ASM-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-ASM-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 -; CHECK-ASM-NEXT: vlse64.v v8, (a1), a3 -; CHECK-ASM-NEXT: addi a4, a1, 80 -; CHECK-ASM-NEXT: vlse64.v v9, (a4), a3 -; CHECK-ASM-NEXT: vse64.v v8, (a0) -; CHECK-ASM-NEXT: addi a4, a0, 16 -; CHECK-ASM-NEXT: vse64.v v9, (a4) -; CHECK-ASM-NEXT: addi a2, a2, -4 -; CHECK-ASM-NEXT: addi a0, a0, 32 -; CHECK-ASM-NEXT: addi a1, a1, 160 -; CHECK-ASM-NEXT: bnez a2, .LBB10_1 -; CHECK-ASM-NEXT: # %bb.2: -; CHECK-ASM-NEXT: ret br label %3 3: ; preds = %3, %2 @@ -863,6 +630,7 @@ declare <2 x i32*> @llvm.masked.gather.v2p0i32.v2p0p0i32(<2 x i32**>, i32 immarg ; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers. 
define void @scatter_of_pointers(i32** noalias nocapture %0, i32** noalias nocapture readonly %1) { +; ; CHECK-LABEL: @scatter_of_pointers( ; CHECK-NEXT: br label [[TMP3:%.*]] ; CHECK: 3: @@ -887,24 +655,6 @@ define void @scatter_of_pointers(i32** noalias nocapture %0, i32** noalias nocap ; CHECK: 15: ; CHECK-NEXT: ret void ; -; CHECK-ASM-LABEL: scatter_of_pointers: -; CHECK-ASM: # %bb.0: -; CHECK-ASM-NEXT: li a2, 1024 -; CHECK-ASM-NEXT: li a3, 40 -; CHECK-ASM-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-ASM-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 -; CHECK-ASM-NEXT: vle64.v v8, (a1) -; CHECK-ASM-NEXT: addi a4, a1, 16 -; CHECK-ASM-NEXT: vle64.v v9, (a4) -; CHECK-ASM-NEXT: addi a4, a0, 80 -; CHECK-ASM-NEXT: vsse64.v v8, (a0), a3 -; CHECK-ASM-NEXT: vsse64.v v9, (a4), a3 -; CHECK-ASM-NEXT: addi a2, a2, -4 -; CHECK-ASM-NEXT: addi a1, a1, 32 -; CHECK-ASM-NEXT: addi a0, a0, 160 -; CHECK-ASM-NEXT: bnez a2, .LBB11_1 -; CHECK-ASM-NEXT: # %bb.2: -; CHECK-ASM-NEXT: ret br label %3 3: ; preds = %3, %2 @@ -935,6 +685,7 @@ define void @scatter_of_pointers(i32** noalias nocapture %0, i32** noalias nocap declare void @llvm.masked.scatter.v2p0i32.v2p0p0i32(<2 x i32*>, <2 x i32**>, i32 immarg, <2 x i1>) define void @strided_load_startval_add_with_splat(i8* noalias nocapture %0, i8* noalias nocapture readonly %1, i32 signext %2) { +; ; CHECK-LABEL: @strided_load_startval_add_with_splat( ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP2:%.*]], 1024 ; CHECK-NEXT: br i1 [[TMP4]], label [[TMP31:%.*]], label [[TMP5:%.*]] @@ -989,58 +740,6 @@ define void @strided_load_startval_add_with_splat(i8* noalias nocapture %0, i8* ; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[TMP41]], 1024 ; CHECK-NEXT: br i1 [[TMP42]], label [[TMP31]], label [[TMP32]] ; -; CHECK-ASM-LABEL: strided_load_startval_add_with_splat: -; CHECK-ASM: # %bb.0: -; CHECK-ASM-NEXT: li a3, 1024 -; CHECK-ASM-NEXT: beq a2, a3, .LBB12_7 -; CHECK-ASM-NEXT: # %bb.1: -; CHECK-ASM-NEXT: li a3, 1023 -; CHECK-ASM-NEXT: subw a4, a3, a2 -; CHECK-ASM-NEXT: li a5, 31 -; CHECK-ASM-NEXT: mv a3, a2 -; CHECK-ASM-NEXT: bltu a4, a5, .LBB12_5 -; CHECK-ASM-NEXT: # %bb.2: -; CHECK-ASM-NEXT: slli a3, a4, 32 -; CHECK-ASM-NEXT: srli a3, a3, 32 -; CHECK-ASM-NEXT: addi a4, a3, 1 -; CHECK-ASM-NEXT: andi a5, a4, -32 -; CHECK-ASM-NEXT: add a3, a5, a2 -; CHECK-ASM-NEXT: slli a6, a2, 2 -; CHECK-ASM-NEXT: add a6, a6, a2 -; CHECK-ASM-NEXT: add a2, a0, a2 -; CHECK-ASM-NEXT: add a6, a1, a6 -; CHECK-ASM-NEXT: li a7, 32 -; CHECK-ASM-NEXT: li t0, 5 -; CHECK-ASM-NEXT: mv t1, a5 -; CHECK-ASM-NEXT: .LBB12_3: # =>This Inner Loop Header: Depth=1 -; CHECK-ASM-NEXT: vsetvli zero, a7, e8, m1, ta, mu -; CHECK-ASM-NEXT: vlse8.v v8, (a6), t0 -; CHECK-ASM-NEXT: vle8.v v9, (a2) -; CHECK-ASM-NEXT: vadd.vv v8, v9, v8 -; CHECK-ASM-NEXT: vse8.v v8, (a2) -; CHECK-ASM-NEXT: addi t1, t1, -32 -; CHECK-ASM-NEXT: addi a2, a2, 32 -; CHECK-ASM-NEXT: addi a6, a6, 160 -; CHECK-ASM-NEXT: bnez t1, .LBB12_3 -; CHECK-ASM-NEXT: # %bb.4: -; CHECK-ASM-NEXT: beq a4, a5, .LBB12_7 -; CHECK-ASM-NEXT: .LBB12_5: -; CHECK-ASM-NEXT: slli a2, a3, 2 -; CHECK-ASM-NEXT: add a2, a2, a3 -; CHECK-ASM-NEXT: add a1, a1, a2 -; CHECK-ASM-NEXT: li a2, 1024 -; CHECK-ASM-NEXT: .LBB12_6: # =>This Inner Loop Header: Depth=1 -; CHECK-ASM-NEXT: lb a4, 0(a1) -; CHECK-ASM-NEXT: add a5, a0, a3 -; CHECK-ASM-NEXT: lb a6, 0(a5) -; CHECK-ASM-NEXT: addw a4, a6, a4 -; CHECK-ASM-NEXT: sb a4, 0(a5) -; CHECK-ASM-NEXT: addiw a4, a3, 1 -; CHECK-ASM-NEXT: addi a3, a3, 1 -; CHECK-ASM-NEXT: addi a1, a1, 5 -; CHECK-ASM-NEXT: bne a4, a2, .LBB12_6 -; 
CHECK-ASM-NEXT:    .LBB12_7:
-; CHECK-ASM-NEXT:    ret
   %4 = icmp eq i32 %2, 1024
   br i1 %4, label %36, label %5
--
2.7.4
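For future updates, the CHECK lines in both files can be refreshed with the UpdateTestChecks scripts rather than edited by hand. A minimal sketch of the regeneration commands, assuming llc and opt have been built into build/bin/ (the binary paths are illustrative and not part of this patch):

  $ llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
      llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll
  $ llvm/utils/update_test_checks.py --opt-binary build/bin/opt \
      llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll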