From 4f4f49137571ee0a5d87ec184b313a4e75c35fda Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Mon, 24 Jul 2023 12:18:04 -0700 Subject: [PATCH] [RISCV] Add memcpy lowering test coverage with and without V --- llvm/test/CodeGen/RISCV/memcpy-inline.ll | 1185 +++++++++++++----- llvm/test/CodeGen/RISCV/memcpy.ll | 448 +++++++ llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll | 1678 ++++++++++++++++++++++++++ 3 files changed, 3007 insertions(+), 304 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/memcpy.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll diff --git a/llvm/test/CodeGen/RISCV/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/memcpy-inline.ll index 932bd2e..833e073 100644 --- a/llvm/test/CodeGen/RISCV/memcpy-inline.ll +++ b/llvm/test/CodeGen/RISCV/memcpy-inline.ll @@ -7,406 +7,935 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST ; RUN: llc < %s -mtriple=riscv64 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST -%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } -@src = external dso_local global %struct.x -@dst = external dso_local global %struct.x +; ---------------------------------------------------------------------- +; Fully unaligned cases -@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1 -@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1 -@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1 -@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1 -@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1 -@.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1 -@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16 +define void @unaligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: unaligned_memcpy0: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: unaligned_memcpy0: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 0, i1 false) + ret void +} + +define void @unaligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: unaligned_memcpy1: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a1, 0(a1) +; RV32-BOTH-NEXT: sb a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: unaligned_memcpy1: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a1, 0(a1) +; RV64-BOTH-NEXT: sb a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false) + ret void +} -define i32 @t0() { -; RV32-LABEL: t0: +define void @unaligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy2: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a0, %hi(src) -; RV32-NEXT: lw a1, %lo(src)(a0) -; RV32-NEXT: lui a2, %hi(dst) -; RV32-NEXT: sw a1, %lo(dst)(a2) -; RV32-NEXT: addi a0, a0, %lo(src) -; RV32-NEXT: lbu a1, 10(a0) -; RV32-NEXT: lh a3, 8(a0) -; RV32-NEXT: lw a0, 4(a0) -; RV32-NEXT: addi a2, a2, %lo(dst) -; RV32-NEXT: sb a1, 10(a2) -; RV32-NEXT: sh a3, 8(a2) -; RV32-NEXT: sw a0, 4(a2) -; RV32-NEXT: li a0, 0 +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: t0: +; RV64-LABEL: unaligned_memcpy2: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a0, %hi(src) -; RV64-NEXT: ld a1, %lo(src)(a0) -; RV64-NEXT: lui a2, %hi(dst) -; RV64-NEXT: addi a0, a0, %lo(src) -; RV64-NEXT: lbu a3, 10(a0) -; RV64-NEXT: lh a0, 8(a0) -; RV64-NEXT: sd a1, %lo(dst)(a2) -; RV64-NEXT: addi a1, a2, %lo(dst) -; RV64-NEXT: sb a3, 10(a1) -; RV64-NEXT: sh a0, 8(a1) -; RV64-NEXT: li a0, 0 +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t0: +; RV32-FAST-LABEL: unaligned_memcpy2: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a0, %hi(src) -; RV32-FAST-NEXT: lw a1, %lo(src)(a0) -; RV32-FAST-NEXT: lui a2, %hi(dst) -; RV32-FAST-NEXT: addi a0, a0, %lo(src) -; RV32-FAST-NEXT: lw a3, 7(a0) -; RV32-FAST-NEXT: lw a0, 4(a0) -; RV32-FAST-NEXT: sw a1, %lo(dst)(a2) -; RV32-FAST-NEXT: addi a1, a2, %lo(dst) -; RV32-FAST-NEXT: sw a3, 7(a1) -; RV32-FAST-NEXT: sw a0, 4(a1) -; RV32-FAST-NEXT: li a0, 0 +; RV32-FAST-NEXT: lh a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t0: +; RV64-FAST-LABEL: unaligned_memcpy2: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a0, %hi(src) -; RV64-FAST-NEXT: ld a1, %lo(src)(a0) -; RV64-FAST-NEXT: addi a0, a0, %lo(src) -; RV64-FAST-NEXT: lw a0, 7(a0) -; RV64-FAST-NEXT: lui a2, %hi(dst) -; RV64-FAST-NEXT: sd a1, %lo(dst)(a2) -; RV64-FAST-NEXT: addi a1, a2, %lo(dst) -; RV64-FAST-NEXT: sw a0, 7(a1) -; RV64-FAST-NEXT: li a0, 0 +; RV64-FAST-NEXT: lh a1, 0(a1) +; RV64-FAST-NEXT: sh a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - call void @llvm.memcpy.p0.p0.i32(ptr align 8 @dst, ptr align 8 @src, i32 11, i1 false) - ret i32 0 + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false) + ret void +} + +define void @unaligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy3: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy3: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy3: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lbu a2, 2(a1) +; RV32-FAST-NEXT: sb a2, 2(a0) +; RV32-FAST-NEXT: lh a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy3: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lbu a2, 2(a1) +; RV64-FAST-NEXT: sb a2, 2(a0) +; RV64-FAST-NEXT: lh a1, 0(a1) +; RV64-FAST-NEXT: sh a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false) + ret void } -define void @t1(ptr nocapture %C) nounwind { -; RV32-LABEL: t1: +define void @unaligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy4: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a1, %hi(.L.str1) -; RV32-NEXT: addi a1, a1, %lo(.L.str1) -; RV32-NEXT: li a2, 31 -; RV32-NEXT: tail memcpy@plt +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret ; -; RV64-LABEL: t1: +; RV64-LABEL: unaligned_memcpy4: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str1) -; RV64-NEXT: addi a1, a1, %lo(.L.str1) -; RV64-NEXT: li a2, 31 -; RV64-NEXT: tail memcpy@plt +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t1: +; RV32-FAST-LABEL: unaligned_memcpy4: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1141 -; RV32-FAST-NEXT: addi a1, a1, -439 -; RV32-FAST-NEXT: sw a1, 27(a0) -; RV32-FAST-NEXT: lui a1, 300325 -; RV32-FAST-NEXT: addi a1, a1, 1107 -; RV32-FAST-NEXT: sw a1, 24(a0) -; RV32-FAST-NEXT: lui a1, 132181 -; RV32-FAST-NEXT: addi a1, a1, -689 -; RV32-FAST-NEXT: sw a1, 20(a0) -; RV32-FAST-NEXT: lui a1, 340483 -; RV32-FAST-NEXT: addi a1, a1, -947 -; RV32-FAST-NEXT: sw a1, 16(a0) -; RV32-FAST-NEXT: lui a1, 267556 -; RV32-FAST-NEXT: addi a1, a1, 1871 -; RV32-FAST-NEXT: sw a1, 12(a0) -; RV32-FAST-NEXT: lui a1, 337154 -; RV32-FAST-NEXT: addi a1, a1, 69 -; RV32-FAST-NEXT: sw a1, 8(a0) -; RV32-FAST-NEXT: lui a1, 320757 -; RV32-FAST-NEXT: addi a1, a1, 1107 -; RV32-FAST-NEXT: sw a1, 4(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: lw a1, 0(a1) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t1: +; RV64-FAST-LABEL: unaligned_memcpy4: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str1) -; RV64-FAST-NEXT: ld a2, %lo(.L.str1)(a1) -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str1) -; RV64-FAST-NEXT: ld a3, 23(a1) -; RV64-FAST-NEXT: ld a4, 16(a1) -; RV64-FAST-NEXT: ld a1, 8(a1) -; RV64-FAST-NEXT: sd a2, 0(a0) -; RV64-FAST-NEXT: sd a3, 23(a0) -; RV64-FAST-NEXT: sd a4, 16(a0) -; RV64-FAST-NEXT: sd a1, 8(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str1, i64 31, i1 false) + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 4, i1 false) ret void } -define void @t2(ptr nocapture %C) nounwind { -; RV32-BOTH-LABEL: t2: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: lui a1, %hi(.L.str2) -; RV32-BOTH-NEXT: addi a1, a1, %lo(.L.str2) -; RV32-BOTH-NEXT: li a2, 36 -; RV32-BOTH-NEXT: tail memcpy@plt +define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy7: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy7: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy7: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 3(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy7: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 3(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 7, i1 false) + ret void +} + +define void @unaligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 7(a1) +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret ; -; RV64-LABEL: t2: +; RV64-LABEL: unaligned_memcpy8: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str2) -; RV64-NEXT: addi a1, a1, %lo(.L.str2) -; RV64-NEXT: li a2, 36 -; RV64-NEXT: tail memcpy@plt +; RV64-NEXT: lbu a2, 7(a1) +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy8: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t2: +; RV64-FAST-LABEL: unaligned_memcpy8: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str2) -; RV64-FAST-NEXT: ld a2, %lo(.L.str2)(a1) -; RV64-FAST-NEXT: sd a2, 0(a0) -; RV64-FAST-NEXT: lui a2, 1156 -; RV64-FAST-NEXT: addiw a2, a2, 332 -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str2) -; RV64-FAST-NEXT: ld a3, 24(a1) -; RV64-FAST-NEXT: ld a4, 16(a1) -; RV64-FAST-NEXT: ld a1, 8(a1) -; RV64-FAST-NEXT: sw a2, 32(a0) -; RV64-FAST-NEXT: sd a3, 24(a0) -; RV64-FAST-NEXT: sd a4, 16(a0) -; RV64-FAST-NEXT: sd a1, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str2, i64 36, i1 false) + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 8, i1 false) ret void } -define void @t3(ptr nocapture %C) nounwind { -; RV32-LABEL: t3: +define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy15: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a1, %hi(.L.str3) -; RV32-NEXT: addi a1, a1, %lo(.L.str3) -; RV32-NEXT: li a2, 24 -; RV32-NEXT: tail memcpy@plt +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: lbu a2, 13(a1) +; RV32-NEXT: sb a2, 13(a0) +; RV32-NEXT: lbu a2, 12(a1) +; RV32-NEXT: sb a2, 12(a0) +; RV32-NEXT: lbu a2, 11(a1) +; RV32-NEXT: sb a2, 11(a0) +; RV32-NEXT: lbu a2, 10(a1) +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: lbu a2, 9(a1) +; RV32-NEXT: sb a2, 9(a0) +; RV32-NEXT: lbu a2, 8(a1) +; RV32-NEXT: sb a2, 8(a0) +; RV32-NEXT: lbu a2, 7(a1) +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret ; -; RV64-LABEL: t3: +; RV64-LABEL: unaligned_memcpy15: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str3) -; RV64-NEXT: addi a1, a1, %lo(.L.str3) -; RV64-NEXT: li a2, 24 -; RV64-NEXT: tail memcpy@plt +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: lbu a2, 13(a1) +; RV64-NEXT: sb a2, 13(a0) +; RV64-NEXT: lbu a2, 12(a1) +; RV64-NEXT: sb a2, 12(a0) +; RV64-NEXT: lbu a2, 11(a1) +; RV64-NEXT: sb a2, 11(a0) +; RV64-NEXT: lbu a2, 10(a1) +; RV64-NEXT: sb a2, 10(a0) +; RV64-NEXT: lbu a2, 9(a1) +; RV64-NEXT: sb a2, 9(a0) +; RV64-NEXT: lbu a2, 8(a1) +; RV64-NEXT: sb a2, 8(a0) +; RV64-NEXT: lbu a2, 7(a1) +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t3: +; RV32-FAST-LABEL: unaligned_memcpy15: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1109 -; RV32-FAST-NEXT: addi a1, a1, -689 -; RV32-FAST-NEXT: sw a1, 20(a0) -; RV32-FAST-NEXT: lui a1, 340483 -; RV32-FAST-NEXT: addi a1, a1, -947 -; RV32-FAST-NEXT: sw a1, 16(a0) -; RV32-FAST-NEXT: lui a1, 267556 -; RV32-FAST-NEXT: addi a1, a1, 1871 -; RV32-FAST-NEXT: sw a1, 12(a0) -; RV32-FAST-NEXT: lui a1, 337154 -; RV32-FAST-NEXT: addi a1, a1, 69 -; RV32-FAST-NEXT: sw a1, 8(a0) -; RV32-FAST-NEXT: lui a1, 320757 -; RV32-FAST-NEXT: addi a1, a1, 1107 -; RV32-FAST-NEXT: sw a1, 4(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: lw a2, 11(a1) +; RV32-FAST-NEXT: sw a2, 11(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t3: +; RV64-FAST-LABEL: unaligned_memcpy15: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str3) -; RV64-FAST-NEXT: ld a2, %lo(.L.str3)(a1) -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str3) -; RV64-FAST-NEXT: ld a3, 16(a1) -; RV64-FAST-NEXT: ld a1, 8(a1) -; RV64-FAST-NEXT: sd a2, 0(a0) -; RV64-FAST-NEXT: sd a3, 16(a0) -; RV64-FAST-NEXT: sd a1, 8(a0) +; RV64-FAST-NEXT: ld a2, 7(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str3, i64 24, i1 false) + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 15, i1 false) ret void } -define void @t4(ptr nocapture %C) nounwind { -; RV32-LABEL: t4: +define void @unaligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy16: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lui a1, %hi(.L.str4) -; RV32-NEXT: addi a1, a1, %lo(.L.str4) -; RV32-NEXT: li a2, 18 -; RV32-NEXT: tail memcpy@plt +; RV32-NEXT: lbu a2, 15(a1) +; RV32-NEXT: sb a2, 15(a0) +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: lbu a2, 13(a1) +; RV32-NEXT: sb a2, 13(a0) +; RV32-NEXT: lbu a2, 12(a1) +; RV32-NEXT: sb a2, 12(a0) +; RV32-NEXT: lbu a2, 11(a1) +; RV32-NEXT: sb a2, 11(a0) +; RV32-NEXT: lbu a2, 10(a1) +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: lbu a2, 9(a1) +; RV32-NEXT: sb a2, 9(a0) +; RV32-NEXT: lbu a2, 8(a1) +; RV32-NEXT: sb a2, 8(a0) +; RV32-NEXT: lbu a2, 7(a1) +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret ; -; RV64-LABEL: t4: +; RV64-LABEL: unaligned_memcpy16: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a1, %hi(.L.str4) -; RV64-NEXT: addi a1, a1, %lo(.L.str4) -; RV64-NEXT: li a2, 18 -; RV64-NEXT: tail memcpy@plt +; RV64-NEXT: lbu a2, 15(a1) +; RV64-NEXT: sb a2, 15(a0) +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: lbu a2, 13(a1) +; RV64-NEXT: sb a2, 13(a0) +; RV64-NEXT: lbu a2, 12(a1) +; RV64-NEXT: sb a2, 12(a0) +; RV64-NEXT: lbu a2, 11(a1) +; RV64-NEXT: sb a2, 11(a0) +; RV64-NEXT: lbu a2, 10(a1) +; RV64-NEXT: sb a2, 10(a0) +; RV64-NEXT: lbu a2, 9(a1) +; RV64-NEXT: sb a2, 9(a0) +; RV64-NEXT: lbu a2, 8(a1) +; RV64-NEXT: sb a2, 8(a0) +; RV64-NEXT: lbu a2, 7(a1) +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t4: +; RV32-FAST-LABEL: unaligned_memcpy16: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: li a1, 32 -; RV32-FAST-NEXT: sh a1, 16(a0) -; RV32-FAST-NEXT: lui a1, 132388 -; RV32-FAST-NEXT: addi a1, a1, 1871 -; RV32-FAST-NEXT: sw a1, 12(a0) -; RV32-FAST-NEXT: lui a1, 337154 -; RV32-FAST-NEXT: addi a1, a1, 69 -; RV32-FAST-NEXT: sw a1, 8(a0) -; RV32-FAST-NEXT: lui a1, 320757 -; RV32-FAST-NEXT: addi a1, a1, 1107 -; RV32-FAST-NEXT: sw a1, 4(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t4: +; RV64-FAST-LABEL: unaligned_memcpy16: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, %hi(.L.str4) -; RV64-FAST-NEXT: ld a2, %lo(.L.str4)(a1) -; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str4) -; RV64-FAST-NEXT: ld a1, 8(a1) -; RV64-FAST-NEXT: li a3, 32 -; RV64-FAST-NEXT: sh a3, 16(a0) -; RV64-FAST-NEXT: sd a2, 0(a0) -; RV64-FAST-NEXT: sd a1, 8(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str4, i64 18, i1 false) + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 16, i1 false) ret void } -define void @t5(ptr nocapture %C) nounwind { -; RV32-LABEL: t5: +define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy31: ; RV32: # %bb.0: # %entry -; RV32-NEXT: sb zero, 6(a0) -; RV32-NEXT: li a1, 84 -; RV32-NEXT: sb a1, 5(a0) -; RV32-NEXT: li a1, 83 -; RV32-NEXT: sb a1, 4(a0) -; RV32-NEXT: li a1, 89 -; RV32-NEXT: sb a1, 3(a0) -; RV32-NEXT: li a1, 82 -; RV32-NEXT: sb a1, 2(a0) -; RV32-NEXT: li a1, 72 -; RV32-NEXT: sb a1, 1(a0) -; RV32-NEXT: li a1, 68 +; RV32-NEXT: lbu a2, 30(a1) +; RV32-NEXT: sb a2, 30(a0) +; RV32-NEXT: lbu a2, 29(a1) +; RV32-NEXT: sb a2, 29(a0) +; RV32-NEXT: lbu a2, 28(a1) +; RV32-NEXT: sb a2, 28(a0) +; RV32-NEXT: lbu a2, 27(a1) +; RV32-NEXT: sb a2, 27(a0) +; RV32-NEXT: lbu a2, 26(a1) +; RV32-NEXT: sb a2, 26(a0) +; RV32-NEXT: lbu a2, 25(a1) +; RV32-NEXT: sb a2, 25(a0) +; RV32-NEXT: lbu a2, 24(a1) +; RV32-NEXT: sb a2, 24(a0) +; RV32-NEXT: lbu a2, 23(a1) +; RV32-NEXT: sb a2, 23(a0) +; RV32-NEXT: lbu a2, 22(a1) +; RV32-NEXT: sb a2, 22(a0) +; RV32-NEXT: lbu a2, 21(a1) +; RV32-NEXT: sb a2, 21(a0) +; RV32-NEXT: lbu a2, 20(a1) +; RV32-NEXT: sb a2, 20(a0) +; RV32-NEXT: lbu a2, 19(a1) +; RV32-NEXT: sb a2, 19(a0) +; RV32-NEXT: lbu a2, 18(a1) +; RV32-NEXT: sb a2, 18(a0) +; RV32-NEXT: lbu a2, 17(a1) +; RV32-NEXT: sb a2, 17(a0) +; RV32-NEXT: lbu a2, 16(a1) +; RV32-NEXT: sb a2, 16(a0) +; RV32-NEXT: lbu a2, 15(a1) +; RV32-NEXT: sb a2, 15(a0) +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: lbu a2, 13(a1) +; RV32-NEXT: sb a2, 13(a0) +; RV32-NEXT: lbu a2, 12(a1) +; RV32-NEXT: sb a2, 12(a0) +; RV32-NEXT: lbu a2, 11(a1) +; RV32-NEXT: sb a2, 11(a0) +; RV32-NEXT: lbu a2, 10(a1) +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: lbu a2, 9(a1) +; RV32-NEXT: sb a2, 9(a0) +; RV32-NEXT: lbu a2, 8(a1) +; RV32-NEXT: sb a2, 8(a0) +; RV32-NEXT: lbu a2, 7(a1) +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lbu a2, 5(a1) +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: lbu a2, 4(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: lbu a2, 3(a1) +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: lbu a2, 1(a1) +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: lbu a1, 0(a1) ; RV32-NEXT: sb a1, 0(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: t5: +; RV64-LABEL: unaligned_memcpy31: ; RV64: # %bb.0: # %entry -; RV64-NEXT: sb zero, 6(a0) -; RV64-NEXT: li a1, 84 -; RV64-NEXT: sb a1, 5(a0) -; RV64-NEXT: li a1, 83 -; RV64-NEXT: sb a1, 4(a0) -; RV64-NEXT: li a1, 89 -; RV64-NEXT: sb a1, 3(a0) -; RV64-NEXT: li a1, 82 -; RV64-NEXT: sb a1, 2(a0) -; RV64-NEXT: li a1, 72 -; RV64-NEXT: sb a1, 1(a0) -; RV64-NEXT: li a1, 68 +; RV64-NEXT: lbu a2, 30(a1) +; RV64-NEXT: sb a2, 30(a0) +; RV64-NEXT: lbu a2, 29(a1) +; RV64-NEXT: sb a2, 29(a0) +; RV64-NEXT: lbu a2, 28(a1) +; RV64-NEXT: sb a2, 28(a0) +; RV64-NEXT: lbu a2, 27(a1) +; RV64-NEXT: sb a2, 27(a0) +; RV64-NEXT: lbu a2, 26(a1) +; RV64-NEXT: sb a2, 26(a0) +; RV64-NEXT: lbu a2, 25(a1) +; RV64-NEXT: sb a2, 25(a0) +; RV64-NEXT: lbu a2, 24(a1) +; RV64-NEXT: sb a2, 24(a0) +; RV64-NEXT: lbu a2, 23(a1) +; RV64-NEXT: sb a2, 23(a0) +; RV64-NEXT: lbu a2, 22(a1) +; RV64-NEXT: sb a2, 22(a0) +; RV64-NEXT: lbu a2, 21(a1) +; RV64-NEXT: sb a2, 21(a0) +; RV64-NEXT: lbu a2, 20(a1) +; RV64-NEXT: sb a2, 20(a0) +; RV64-NEXT: lbu a2, 19(a1) +; RV64-NEXT: sb a2, 19(a0) +; RV64-NEXT: lbu a2, 18(a1) +; RV64-NEXT: sb a2, 18(a0) +; RV64-NEXT: lbu a2, 17(a1) +; RV64-NEXT: sb a2, 17(a0) +; RV64-NEXT: lbu a2, 16(a1) +; RV64-NEXT: sb a2, 16(a0) +; RV64-NEXT: lbu a2, 15(a1) +; RV64-NEXT: sb a2, 15(a0) +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: lbu a2, 13(a1) +; RV64-NEXT: sb a2, 13(a0) +; RV64-NEXT: lbu a2, 12(a1) +; RV64-NEXT: sb a2, 12(a0) +; RV64-NEXT: lbu a2, 11(a1) +; RV64-NEXT: sb a2, 11(a0) +; RV64-NEXT: lbu a2, 10(a1) +; RV64-NEXT: sb a2, 10(a0) +; RV64-NEXT: lbu a2, 9(a1) +; RV64-NEXT: sb a2, 9(a0) +; RV64-NEXT: lbu a2, 8(a1) +; RV64-NEXT: sb a2, 8(a0) +; RV64-NEXT: lbu a2, 7(a1) +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lbu a2, 5(a1) +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: lbu a2, 4(a1) +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: lbu a2, 3(a1) +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: lbu a2, 1(a1) +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: lbu a1, 0(a1) ; RV64-NEXT: sb a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t5: +; RV32-FAST-LABEL: unaligned_memcpy31: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a1, 1349 -; RV32-FAST-NEXT: addi a1, a1, 857 -; RV32-FAST-NEXT: sw a1, 3(a0) -; RV32-FAST-NEXT: lui a1, 365861 -; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: lw a2, 27(a1) +; RV32-FAST-NEXT: sw a2, 27(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: sw a2, 16(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) ; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t5: +; RV64-FAST-LABEL: unaligned_memcpy31: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a1, 1349 -; RV64-FAST-NEXT: addiw a1, a1, 857 -; RV64-FAST-NEXT: sw a1, 3(a0) -; RV64-FAST-NEXT: lui a1, 365861 -; RV64-FAST-NEXT: addiw a1, a1, -1980 +; RV64-FAST-NEXT: ld a2, 23(a1) +; RV64-FAST-NEXT: sd a2, 23(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 31, i1 false) + ret void +} + +; ---------------------------------------------------------------------- +; Fully aligned cases + +define void @aligned_memcpy0(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy0: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy0: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 0, i1 false) + ret void +} + +define void @aligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy1: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a1, 0(a1) +; RV32-BOTH-NEXT: sb a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy1: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a1, 0(a1) +; RV64-BOTH-NEXT: sb a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 1, i1 false) + ret void +} + +define void @aligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy2: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lh a1, 0(a1) +; RV32-BOTH-NEXT: sh a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy2: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lh a1, 0(a1) +; RV64-BOTH-NEXT: sh a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 2, i1 false) + ret void +} + +define void @aligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy3: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a2, 2(a1) +; RV32-BOTH-NEXT: sb a2, 2(a0) +; RV32-BOTH-NEXT: lh a1, 0(a1) +; RV32-BOTH-NEXT: sh a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy3: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a2, 2(a1) +; RV64-BOTH-NEXT: sb a2, 2(a0) +; RV64-BOTH-NEXT: lh a1, 0(a1) +; RV64-BOTH-NEXT: sh a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 3, i1 false) + ret void +} + +define void @aligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy4: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy4: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lw a1, 0(a1) +; RV64-BOTH-NEXT: sw a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 4, i1 false) + ret void +} + +define void @aligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy7: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lh a2, 4(a1) +; RV32-NEXT: sh a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: aligned_memcpy7: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lh a2, 4(a1) +; RV64-NEXT: sh a2, 4(a0) +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: sw a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: aligned_memcpy7: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 3(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: aligned_memcpy7: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 3(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) ; RV64-FAST-NEXT: sw a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str5, i64 7, i1 false) + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 7, i1 false) + ret void +} + +define void @aligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy8: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a2, 4(a1) +; RV32-BOTH-NEXT: sw a2, 4(a0) +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy8: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ld a1, 0(a1) +; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 8, i1 false) ret void } -define void @t6() nounwind { -; RV32-LABEL: t6: +define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy15: ; RV32: # %bb.0: # %entry -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: lui a0, %hi(spool.splbuf) -; RV32-NEXT: addi a0, a0, %lo(spool.splbuf) -; RV32-NEXT: lui a1, %hi(.L.str6) -; RV32-NEXT: addi a1, a1, %lo(.L.str6) -; RV32-NEXT: li a2, 14 -; RV32-NEXT: call memcpy@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: lh a2, 12(a1) +; RV32-NEXT: sh a2, 12(a0) +; RV32-NEXT: lw a2, 8(a1) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) ; RV32-NEXT: ret ; -; RV64-LABEL: t6: +; RV64-LABEL: aligned_memcpy15: ; RV64: # %bb.0: # %entry -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: lui a0, %hi(spool.splbuf) -; RV64-NEXT: addi a0, a0, %lo(spool.splbuf) -; RV64-NEXT: lui a1, %hi(.L.str6) -; RV64-NEXT: addi a1, a1, %lo(.L.str6) -; RV64-NEXT: li a2, 14 -; RV64-NEXT: call memcpy@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: lh a2, 12(a1) +; RV64-NEXT: sh a2, 12(a0) +; RV64-NEXT: lw a2, 8(a1) +; RV64-NEXT: sw a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) ; RV64-NEXT: ret ; -; RV32-FAST-LABEL: t6: +; RV32-FAST-LABEL: aligned_memcpy15: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lui a0, %hi(spool.splbuf) -; RV32-FAST-NEXT: li a1, 88 -; RV32-FAST-NEXT: sh a1, %lo(spool.splbuf+12)(a0) -; RV32-FAST-NEXT: lui a1, 361862 -; RV32-FAST-NEXT: addi a1, a1, -1960 -; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+8)(a0) -; RV32-FAST-NEXT: lui a1, 362199 -; RV32-FAST-NEXT: addi a1, a1, 559 -; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+4)(a0) -; RV32-FAST-NEXT: lui a1, 460503 -; RV32-FAST-NEXT: addi a1, a1, 1071 -; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf)(a0) +; RV32-FAST-NEXT: lw a2, 11(a1) +; RV32-FAST-NEXT: sw a2, 11(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) ; RV32-FAST-NEXT: ret ; -; RV64-FAST-LABEL: t6: +; RV64-FAST-LABEL: aligned_memcpy15: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lui a0, %hi(.L.str6) -; RV64-FAST-NEXT: ld a1, %lo(.L.str6)(a0) -; RV64-FAST-NEXT: addi a0, a0, %lo(.L.str6) -; RV64-FAST-NEXT: ld a0, 6(a0) -; RV64-FAST-NEXT: lui a2, %hi(spool.splbuf) -; RV64-FAST-NEXT: sd a1, %lo(spool.splbuf)(a2) -; RV64-FAST-NEXT: sd a0, %lo(spool.splbuf+6)(a2) +; RV64-FAST-NEXT: ld a2, 7(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - call void @llvm.memcpy.p0.p0.i64(ptr @spool.splbuf, ptr @.str6, i64 14, i1 false) + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 15, i1 false) ret void } -%struct.Foo = type { i32, i32, i32, i32 } +define void @aligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy16: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a2, 12(a1) +; RV32-BOTH-NEXT: sw a2, 12(a0) +; RV32-BOTH-NEXT: lw a2, 8(a1) +; RV32-BOTH-NEXT: sw a2, 8(a0) +; RV32-BOTH-NEXT: lw a2, 4(a1) +; RV32-BOTH-NEXT: sw a2, 4(a0) +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy16: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ld a2, 8(a1) +; RV64-BOTH-NEXT: sd a2, 8(a0) +; RV64-BOTH-NEXT: ld a1, 0(a1) +; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 16, i1 false) + ret void +} -define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { -; RV32-BOTH-LABEL: t7: +define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy31: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 30(a1) +; RV32-NEXT: sb a2, 30(a0) +; RV32-NEXT: lh a2, 28(a1) +; RV32-NEXT: sh a2, 28(a0) +; RV32-NEXT: lw a2, 24(a1) +; RV32-NEXT: sw a2, 24(a0) +; RV32-NEXT: lw a2, 20(a1) +; RV32-NEXT: sw a2, 20(a0) +; RV32-NEXT: lw a2, 16(a1) +; RV32-NEXT: sw a2, 16(a0) +; RV32-NEXT: lw a2, 12(a1) +; RV32-NEXT: sw a2, 12(a0) +; RV32-NEXT: lw a2, 8(a1) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: aligned_memcpy31: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 30(a1) +; RV64-NEXT: sb a2, 30(a0) +; RV64-NEXT: lh a2, 28(a1) +; RV64-NEXT: sh a2, 28(a0) +; RV64-NEXT: lw a2, 24(a1) +; RV64-NEXT: sw a2, 24(a0) +; RV64-NEXT: ld a2, 16(a1) +; RV64-NEXT: sd a2, 16(a0) +; RV64-NEXT: ld a2, 8(a1) +; RV64-NEXT: sd a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: aligned_memcpy31: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 27(a1) +; RV32-FAST-NEXT: sw a2, 27(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: sw a2, 16(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: aligned_memcpy31: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 23(a1) +; RV64-FAST-NEXT: sd a2, 23(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 31, i1 false) + ret void +} + +; ------------------------------------------------------------------------ +; A few partially aligned cases + + +define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { +; RV32-BOTH-LABEL: memcpy16_align4: ; RV32-BOTH: # %bb.0: # %entry ; RV32-BOTH-NEXT: lw a2, 12(a1) ; RV32-BOTH-NEXT: sw a2, 12(a0) @@ -418,7 +947,7 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV32-BOTH-NEXT: sw a1, 0(a0) ; RV32-BOTH-NEXT: ret ; -; RV64-LABEL: t7: +; RV64-LABEL: memcpy16_align4: ; RV64: # %bb.0: # %entry ; RV64-NEXT: lw a2, 12(a1) ; RV64-NEXT: sw a2, 12(a0) @@ -430,7 +959,7 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV64-NEXT: sw a1, 0(a0) ; RV64-NEXT: ret ; -; RV64-FAST-LABEL: t7: +; RV64-FAST-LABEL: memcpy16_align4: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: ld a2, 8(a1) ; RV64-FAST-NEXT: sd a2, 8(a0) @@ -438,11 +967,59 @@ define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { ; RV64-FAST-NEXT: sd a1, 0(a0) ; RV64-FAST-NEXT: ret entry: - tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %a, ptr align 4 %b, i32 16, i1 false) + tail call void @llvm.memcpy.inline.p0.p0.i32(ptr align 4 %dest, ptr align 4 %src, i32 16, i1 false) ret void } -declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1) nounwind -declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; RV64-BOTH: {{.*}} +define i32 @memcpy11_align8(ptr nocapture %dest, ptr %src) { +; RV32-LABEL: memcpy11_align8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 10(a1) +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: lh a2, 8(a1) +; RV32-NEXT: sh a2, 8(a0) +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: memcpy11_align8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 10(a1) +; RV64-NEXT: sb a2, 10(a0) +; RV64-NEXT: lh a2, 8(a1) +; RV64-NEXT: sh a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memcpy11_align8: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 7(a1) +; RV32-FAST-NEXT: sw a2, 7(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: li a0, 0 +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memcpy11_align8: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 7(a1) +; RV64-FAST-NEXT: sw a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: li a0, 0 +; RV64-FAST-NEXT: ret +entry: + call void @llvm.memcpy.inline.p0.p0.i32(ptr align 8 %dest, ptr align 8 %src, i32 11, i1 false) + ret i32 0 +} + + +declare void @llvm.memcpy.inline.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1) nounwind +declare void @llvm.memcpy.inline.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll new file mode 100644 index 0000000..932bd2e --- /dev/null +++ b/llvm/test/CodeGen/RISCV/memcpy.ll @@ -0,0 +1,448 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 \ +; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32 +; RUN: llc < %s -mtriple=riscv64 \ +; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64 +; RUN: llc < %s -mtriple=riscv32 -mattr=+unaligned-scalar-mem \ +; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST +; RUN: llc < %s -mtriple=riscv64 -mattr=+unaligned-scalar-mem \ +; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST +%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } + +@src = external dso_local global %struct.x +@dst = external dso_local global %struct.x + +@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1 +@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1 +@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1 +@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1 +@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1 +@.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1 +@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16 + +define i32 @t0() { +; RV32-LABEL: t0: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lui a0, %hi(src) +; RV32-NEXT: lw a1, %lo(src)(a0) +; RV32-NEXT: lui a2, %hi(dst) +; RV32-NEXT: sw a1, %lo(dst)(a2) +; RV32-NEXT: addi a0, a0, %lo(src) +; RV32-NEXT: lbu a1, 10(a0) +; RV32-NEXT: lh a3, 8(a0) +; RV32-NEXT: lw a0, 4(a0) +; RV32-NEXT: addi a2, a2, %lo(dst) +; RV32-NEXT: sb a1, 10(a2) +; RV32-NEXT: sh a3, 8(a2) +; RV32-NEXT: sw a0, 4(a2) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: t0: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lui a0, %hi(src) +; RV64-NEXT: ld a1, %lo(src)(a0) +; RV64-NEXT: lui a2, %hi(dst) +; RV64-NEXT: addi a0, a0, %lo(src) +; RV64-NEXT: lbu a3, 10(a0) +; RV64-NEXT: lh a0, 8(a0) +; RV64-NEXT: sd a1, %lo(dst)(a2) +; RV64-NEXT: addi a1, a2, %lo(dst) +; RV64-NEXT: sb a3, 10(a1) +; RV64-NEXT: sh a0, 8(a1) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: t0: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lui a0, %hi(src) +; RV32-FAST-NEXT: lw a1, %lo(src)(a0) +; RV32-FAST-NEXT: lui a2, %hi(dst) +; RV32-FAST-NEXT: addi a0, a0, %lo(src) +; RV32-FAST-NEXT: lw a3, 7(a0) +; RV32-FAST-NEXT: lw a0, 4(a0) +; RV32-FAST-NEXT: sw a1, %lo(dst)(a2) +; RV32-FAST-NEXT: addi a1, a2, %lo(dst) +; RV32-FAST-NEXT: sw a3, 7(a1) +; RV32-FAST-NEXT: sw a0, 4(a1) +; RV32-FAST-NEXT: li a0, 0 +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: t0: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lui a0, %hi(src) +; RV64-FAST-NEXT: ld a1, %lo(src)(a0) +; RV64-FAST-NEXT: addi a0, a0, %lo(src) +; RV64-FAST-NEXT: lw a0, 7(a0) +; RV64-FAST-NEXT: lui a2, %hi(dst) +; RV64-FAST-NEXT: sd a1, %lo(dst)(a2) +; RV64-FAST-NEXT: addi a1, a2, %lo(dst) +; RV64-FAST-NEXT: sw a0, 7(a1) +; RV64-FAST-NEXT: li a0, 0 +; RV64-FAST-NEXT: ret +entry: + call void @llvm.memcpy.p0.p0.i32(ptr align 8 @dst, ptr align 8 @src, i32 11, i1 false) + ret i32 0 +} + +define void @t1(ptr nocapture %C) nounwind { +; RV32-LABEL: t1: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lui a1, %hi(.L.str1) +; RV32-NEXT: addi a1, a1, %lo(.L.str1) +; RV32-NEXT: li a2, 31 +; RV32-NEXT: tail memcpy@plt +; +; RV64-LABEL: t1: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lui a1, %hi(.L.str1) +; RV64-NEXT: addi a1, a1, %lo(.L.str1) +; RV64-NEXT: li a2, 31 +; RV64-NEXT: tail memcpy@plt +; +; RV32-FAST-LABEL: t1: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lui a1, 1141 +; RV32-FAST-NEXT: addi a1, a1, -439 +; RV32-FAST-NEXT: sw a1, 27(a0) +; RV32-FAST-NEXT: lui a1, 300325 +; RV32-FAST-NEXT: addi a1, a1, 1107 +; RV32-FAST-NEXT: sw a1, 24(a0) +; RV32-FAST-NEXT: lui a1, 132181 +; RV32-FAST-NEXT: addi a1, a1, -689 +; RV32-FAST-NEXT: sw a1, 20(a0) +; RV32-FAST-NEXT: lui a1, 340483 +; RV32-FAST-NEXT: addi a1, a1, -947 +; RV32-FAST-NEXT: sw a1, 16(a0) +; RV32-FAST-NEXT: lui a1, 267556 +; RV32-FAST-NEXT: addi a1, a1, 1871 +; RV32-FAST-NEXT: sw a1, 12(a0) +; RV32-FAST-NEXT: lui a1, 337154 +; RV32-FAST-NEXT: addi a1, a1, 69 +; RV32-FAST-NEXT: sw a1, 8(a0) +; RV32-FAST-NEXT: lui a1, 320757 +; RV32-FAST-NEXT: addi a1, a1, 1107 +; RV32-FAST-NEXT: sw a1, 4(a0) +; RV32-FAST-NEXT: lui a1, 365861 +; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: t1: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lui a1, %hi(.L.str1) +; RV64-FAST-NEXT: ld a2, %lo(.L.str1)(a1) +; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str1) +; RV64-FAST-NEXT: ld a3, 23(a1) +; RV64-FAST-NEXT: ld a4, 16(a1) +; RV64-FAST-NEXT: ld a1, 8(a1) +; RV64-FAST-NEXT: sd a2, 0(a0) +; RV64-FAST-NEXT: sd a3, 23(a0) +; RV64-FAST-NEXT: sd a4, 16(a0) +; RV64-FAST-NEXT: sd a1, 8(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str1, i64 31, i1 false) + ret void +} + +define void @t2(ptr nocapture %C) nounwind { +; RV32-BOTH-LABEL: t2: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lui a1, %hi(.L.str2) +; RV32-BOTH-NEXT: addi a1, a1, %lo(.L.str2) +; RV32-BOTH-NEXT: li a2, 36 +; RV32-BOTH-NEXT: tail memcpy@plt +; +; RV64-LABEL: t2: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lui a1, %hi(.L.str2) +; RV64-NEXT: addi a1, a1, %lo(.L.str2) +; RV64-NEXT: li a2, 36 +; RV64-NEXT: tail memcpy@plt +; +; RV64-FAST-LABEL: t2: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lui a1, %hi(.L.str2) +; RV64-FAST-NEXT: ld a2, %lo(.L.str2)(a1) +; RV64-FAST-NEXT: sd a2, 0(a0) +; RV64-FAST-NEXT: lui a2, 1156 +; RV64-FAST-NEXT: addiw a2, a2, 332 +; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str2) +; RV64-FAST-NEXT: ld a3, 24(a1) +; RV64-FAST-NEXT: ld a4, 16(a1) +; RV64-FAST-NEXT: ld a1, 8(a1) +; RV64-FAST-NEXT: sw a2, 32(a0) +; RV64-FAST-NEXT: sd a3, 24(a0) +; RV64-FAST-NEXT: sd a4, 16(a0) +; RV64-FAST-NEXT: sd a1, 8(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str2, i64 36, i1 false) + ret void +} + +define void @t3(ptr nocapture %C) nounwind { +; RV32-LABEL: t3: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lui a1, %hi(.L.str3) +; RV32-NEXT: addi a1, a1, %lo(.L.str3) +; RV32-NEXT: li a2, 24 +; RV32-NEXT: tail memcpy@plt +; +; RV64-LABEL: t3: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lui a1, %hi(.L.str3) +; RV64-NEXT: addi a1, a1, %lo(.L.str3) +; RV64-NEXT: li a2, 24 +; RV64-NEXT: tail memcpy@plt +; +; RV32-FAST-LABEL: t3: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lui a1, 1109 +; RV32-FAST-NEXT: addi a1, a1, -689 +; RV32-FAST-NEXT: sw a1, 20(a0) +; RV32-FAST-NEXT: lui a1, 340483 +; RV32-FAST-NEXT: addi a1, a1, -947 +; RV32-FAST-NEXT: sw a1, 16(a0) +; RV32-FAST-NEXT: lui a1, 267556 +; RV32-FAST-NEXT: addi a1, a1, 1871 +; RV32-FAST-NEXT: sw a1, 12(a0) +; RV32-FAST-NEXT: lui a1, 337154 +; RV32-FAST-NEXT: addi a1, a1, 69 +; RV32-FAST-NEXT: sw a1, 8(a0) +; RV32-FAST-NEXT: lui a1, 320757 +; RV32-FAST-NEXT: addi a1, a1, 1107 +; RV32-FAST-NEXT: sw a1, 4(a0) +; RV32-FAST-NEXT: lui a1, 365861 +; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: t3: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lui a1, %hi(.L.str3) +; RV64-FAST-NEXT: ld a2, %lo(.L.str3)(a1) +; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str3) +; RV64-FAST-NEXT: ld a3, 16(a1) +; RV64-FAST-NEXT: ld a1, 8(a1) +; RV64-FAST-NEXT: sd a2, 0(a0) +; RV64-FAST-NEXT: sd a3, 16(a0) +; RV64-FAST-NEXT: sd a1, 8(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str3, i64 24, i1 false) + ret void +} + +define void @t4(ptr nocapture %C) nounwind { +; RV32-LABEL: t4: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lui a1, %hi(.L.str4) +; RV32-NEXT: addi a1, a1, %lo(.L.str4) +; RV32-NEXT: li a2, 18 +; RV32-NEXT: tail memcpy@plt +; +; RV64-LABEL: t4: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lui a1, %hi(.L.str4) +; RV64-NEXT: addi a1, a1, %lo(.L.str4) +; RV64-NEXT: li a2, 18 +; RV64-NEXT: tail memcpy@plt +; +; RV32-FAST-LABEL: t4: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: li a1, 32 +; RV32-FAST-NEXT: sh a1, 16(a0) +; RV32-FAST-NEXT: lui a1, 132388 +; RV32-FAST-NEXT: addi a1, a1, 1871 +; RV32-FAST-NEXT: sw a1, 12(a0) +; RV32-FAST-NEXT: lui a1, 337154 +; RV32-FAST-NEXT: addi a1, a1, 69 +; RV32-FAST-NEXT: sw a1, 8(a0) +; RV32-FAST-NEXT: lui a1, 320757 +; RV32-FAST-NEXT: addi a1, a1, 1107 +; RV32-FAST-NEXT: sw a1, 4(a0) +; RV32-FAST-NEXT: lui a1, 365861 +; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: t4: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lui a1, %hi(.L.str4) +; RV64-FAST-NEXT: ld a2, %lo(.L.str4)(a1) +; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str4) +; RV64-FAST-NEXT: ld a1, 8(a1) +; RV64-FAST-NEXT: li a3, 32 +; RV64-FAST-NEXT: sh a3, 16(a0) +; RV64-FAST-NEXT: sd a2, 0(a0) +; RV64-FAST-NEXT: sd a1, 8(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str4, i64 18, i1 false) + ret void +} + +define void @t5(ptr nocapture %C) nounwind { +; RV32-LABEL: t5: +; RV32: # %bb.0: # %entry +; RV32-NEXT: sb zero, 6(a0) +; RV32-NEXT: li a1, 84 +; RV32-NEXT: sb a1, 5(a0) +; RV32-NEXT: li a1, 83 +; RV32-NEXT: sb a1, 4(a0) +; RV32-NEXT: li a1, 89 +; RV32-NEXT: sb a1, 3(a0) +; RV32-NEXT: li a1, 82 +; RV32-NEXT: sb a1, 2(a0) +; RV32-NEXT: li a1, 72 +; RV32-NEXT: sb a1, 1(a0) +; RV32-NEXT: li a1, 68 +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: t5: +; RV64: # %bb.0: # %entry +; RV64-NEXT: sb zero, 6(a0) +; RV64-NEXT: li a1, 84 +; RV64-NEXT: sb a1, 5(a0) +; RV64-NEXT: li a1, 83 +; RV64-NEXT: sb a1, 4(a0) +; RV64-NEXT: li a1, 89 +; RV64-NEXT: sb a1, 3(a0) +; RV64-NEXT: li a1, 82 +; RV64-NEXT: sb a1, 2(a0) +; RV64-NEXT: li a1, 72 +; RV64-NEXT: sb a1, 1(a0) +; RV64-NEXT: li a1, 68 +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: t5: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lui a1, 1349 +; RV32-FAST-NEXT: addi a1, a1, 857 +; RV32-FAST-NEXT: sw a1, 3(a0) +; RV32-FAST-NEXT: lui a1, 365861 +; RV32-FAST-NEXT: addi a1, a1, -1980 +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: t5: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lui a1, 1349 +; RV64-FAST-NEXT: addiw a1, a1, 857 +; RV64-FAST-NEXT: sw a1, 3(a0) +; RV64-FAST-NEXT: lui a1, 365861 +; RV64-FAST-NEXT: addiw a1, a1, -1980 +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str5, i64 7, i1 false) + ret void +} + +define void @t6() nounwind { +; RV32-LABEL: t6: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a0, %hi(spool.splbuf) +; RV32-NEXT: addi a0, a0, %lo(spool.splbuf) +; RV32-NEXT: lui a1, %hi(.L.str6) +; RV32-NEXT: addi a1, a1, %lo(.L.str6) +; RV32-NEXT: li a2, 14 +; RV32-NEXT: call memcpy@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: t6: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: lui a0, %hi(spool.splbuf) +; RV64-NEXT: addi a0, a0, %lo(spool.splbuf) +; RV64-NEXT: lui a1, %hi(.L.str6) +; RV64-NEXT: addi a1, a1, %lo(.L.str6) +; RV64-NEXT: li a2, 14 +; RV64-NEXT: call memcpy@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: t6: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lui a0, %hi(spool.splbuf) +; RV32-FAST-NEXT: li a1, 88 +; RV32-FAST-NEXT: sh a1, %lo(spool.splbuf+12)(a0) +; RV32-FAST-NEXT: lui a1, 361862 +; RV32-FAST-NEXT: addi a1, a1, -1960 +; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+8)(a0) +; RV32-FAST-NEXT: lui a1, 362199 +; RV32-FAST-NEXT: addi a1, a1, 559 +; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf+4)(a0) +; RV32-FAST-NEXT: lui a1, 460503 +; RV32-FAST-NEXT: addi a1, a1, 1071 +; RV32-FAST-NEXT: sw a1, %lo(spool.splbuf)(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: t6: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lui a0, %hi(.L.str6) +; RV64-FAST-NEXT: ld a1, %lo(.L.str6)(a0) +; RV64-FAST-NEXT: addi a0, a0, %lo(.L.str6) +; RV64-FAST-NEXT: ld a0, 6(a0) +; RV64-FAST-NEXT: lui a2, %hi(spool.splbuf) +; RV64-FAST-NEXT: sd a1, %lo(spool.splbuf)(a2) +; RV64-FAST-NEXT: sd a0, %lo(spool.splbuf+6)(a2) +; RV64-FAST-NEXT: ret +entry: + call void @llvm.memcpy.p0.p0.i64(ptr @spool.splbuf, ptr @.str6, i64 14, i1 false) + ret void +} + +%struct.Foo = type { i32, i32, i32, i32 } + +define void @t7(ptr nocapture %a, ptr nocapture %b) nounwind { +; RV32-BOTH-LABEL: t7: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a2, 12(a1) +; RV32-BOTH-NEXT: sw a2, 12(a0) +; RV32-BOTH-NEXT: lw a2, 8(a1) +; RV32-BOTH-NEXT: sw a2, 8(a0) +; RV32-BOTH-NEXT: lw a2, 4(a1) +; RV32-BOTH-NEXT: sw a2, 4(a0) +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-LABEL: t7: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lw a2, 12(a1) +; RV64-NEXT: sw a2, 12(a0) +; RV64-NEXT: lw a2, 8(a1) +; RV64-NEXT: sw a2, 8(a0) +; RV64-NEXT: lw a2, 4(a1) +; RV64-NEXT: sw a2, 4(a0) +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: sw a1, 0(a0) +; RV64-NEXT: ret +; +; RV64-FAST-LABEL: t7: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %a, ptr align 4 %b, i32 16, i1 false) + ret void +} + +declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1) nounwind +declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV64-BOTH: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll new file mode 100644 index 0000000..1aa8600 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll @@ -0,0 +1,1678 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v \ +; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v \ +; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+unaligned-scalar-mem \ +; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+unaligned-scalar-mem \ +; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST + +; ---------------------------------------------------------------------- +; Fully unaligned cases + + +define void @unaligned_memcpy1(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: unaligned_memcpy1: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a1, 0(a1) +; RV32-BOTH-NEXT: sb a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: unaligned_memcpy1: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a1, 0(a1) +; RV64-BOTH-NEXT: sb a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false) + ret void +} + +define void @unaligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy2: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy2: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy2: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lh a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy2: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lh a1, 0(a1) +; RV64-FAST-NEXT: sh a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false) + ret void +} + +define void @unaligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy3: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 2(a1) +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy3: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 2(a1) +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy3: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lbu a2, 2(a1) +; RV32-FAST-NEXT: sb a2, 2(a0) +; RV32-FAST-NEXT: lh a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy3: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lbu a2, 2(a1) +; RV64-FAST-NEXT: sb a2, 2(a0) +; RV64-FAST-NEXT: lh a1, 0(a1) +; RV64-FAST-NEXT: sh a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false) + ret void +} + +define void @unaligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy4: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy4: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy4: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy4: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 4, i1 false) + ret void +} + +define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy7: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: addi a1, a1, 4 +; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: addi a0, a0, 4 +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy7: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: addi a1, a1, 4 +; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: addi a0, a0, 4 +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy7: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 3(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy7: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 3(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 7, i1 false) + ret void +} + +define void @unaligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy8: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy8: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 8, i1 false) + ret void +} + +define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy15: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: addi a2, a1, 12 +; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV32-NEXT: vle8.v v8, (a2) +; RV32-NEXT: addi a2, a0, 12 +; RV32-NEXT: vse8.v v8, (a2) +; RV32-NEXT: addi a1, a1, 8 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: addi a0, a0, 8 +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy15: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: addi a2, a1, 12 +; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV64-NEXT: vle8.v v8, (a2) +; RV64-NEXT: addi a2, a0, 12 +; RV64-NEXT: vse8.v v8, (a2) +; RV64-NEXT: addi a1, a1, 8 +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: addi a0, a0, 8 +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy15: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 11(a1) +; RV32-FAST-NEXT: sw a2, 11(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy15: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 7(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 15, i1 false) + ret void +} + +define void @unaligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy16: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy16: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 16, i1 false) + ret void +} + +define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy31: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 30(a1) +; RV32-NEXT: sb a2, 30(a0) +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: addi a2, a1, 28 +; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV32-NEXT: vle8.v v8, (a2) +; RV32-NEXT: addi a2, a0, 28 +; RV32-NEXT: vse8.v v8, (a2) +; RV32-NEXT: addi a2, a1, 24 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vle8.v v8, (a2) +; RV32-NEXT: addi a2, a0, 24 +; RV32-NEXT: vse8.v v8, (a2) +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy31: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 30(a1) +; RV64-NEXT: sb a2, 30(a0) +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: addi a2, a1, 28 +; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV64-NEXT: vle8.v v8, (a2) +; RV64-NEXT: addi a2, a0, 28 +; RV64-NEXT: vse8.v v8, (a2) +; RV64-NEXT: addi a2, a1, 24 +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vle8.v v8, (a2) +; RV64-NEXT: addi a2, a0, 24 +; RV64-NEXT: vse8.v v8, (a2) +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy31: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 27(a1) +; RV32-FAST-NEXT: sw a2, 27(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: sw a2, 16(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy31: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 23(a1) +; RV64-FAST-NEXT: sd a2, 23(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 31, i1 false) + ret void +} + +define void @unaligned_memcpy32(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a2, 32 +; RV64-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy32: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 28(a1) +; RV32-FAST-NEXT: sw a2, 28(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: sw a2, 16(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy32: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 24(a1) +; RV64-FAST-NEXT: sd a2, 24(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 32, i1 false) + ret void +} + +define void @unaligned_memcpy64(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a2, 64 +; RV32-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a2, 64 +; RV64-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy64: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 60(a1) +; RV32-FAST-NEXT: sw a2, 60(a0) +; RV32-FAST-NEXT: lw a2, 56(a1) +; RV32-FAST-NEXT: sw a2, 56(a0) +; RV32-FAST-NEXT: lw a2, 52(a1) +; RV32-FAST-NEXT: sw a2, 52(a0) +; RV32-FAST-NEXT: lw a2, 48(a1) +; RV32-FAST-NEXT: sw a2, 48(a0) +; RV32-FAST-NEXT: lw a2, 44(a1) +; RV32-FAST-NEXT: sw a2, 44(a0) +; RV32-FAST-NEXT: lw a2, 40(a1) +; RV32-FAST-NEXT: sw a2, 40(a0) +; RV32-FAST-NEXT: lw a2, 36(a1) +; RV32-FAST-NEXT: sw a2, 36(a0) +; RV32-FAST-NEXT: lw a2, 32(a1) +; RV32-FAST-NEXT: sw a2, 32(a0) +; RV32-FAST-NEXT: lw a2, 28(a1) +; RV32-FAST-NEXT: sw a2, 28(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: sw a2, 16(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy64: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 56(a1) +; RV64-FAST-NEXT: sd a2, 56(a0) +; RV64-FAST-NEXT: ld a2, 48(a1) +; RV64-FAST-NEXT: sd a2, 48(a0) +; RV64-FAST-NEXT: ld a2, 40(a1) +; RV64-FAST-NEXT: sd a2, 40(a0) +; RV64-FAST-NEXT: ld a2, 32(a1) +; RV64-FAST-NEXT: sd a2, 32(a0) +; RV64-FAST-NEXT: ld a2, 24(a1) +; RV64-FAST-NEXT: sd a2, 24(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 64, i1 false) + ret void +} + +define void @unaligned_memcpy96(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy96: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a2, 64 +; RV32-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy96: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a2, 64 +; RV64-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: li a2, 32 +; RV64-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: addi a0, a0, 64 +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy96: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 92(a1) +; RV32-FAST-NEXT: sw a2, 92(a0) +; RV32-FAST-NEXT: lw a2, 88(a1) +; RV32-FAST-NEXT: sw a2, 88(a0) +; RV32-FAST-NEXT: lw a2, 84(a1) +; RV32-FAST-NEXT: sw a2, 84(a0) +; RV32-FAST-NEXT: lw a2, 80(a1) +; RV32-FAST-NEXT: sw a2, 80(a0) +; RV32-FAST-NEXT: lw a2, 76(a1) +; RV32-FAST-NEXT: sw a2, 76(a0) +; RV32-FAST-NEXT: lw a2, 72(a1) +; RV32-FAST-NEXT: sw a2, 72(a0) +; RV32-FAST-NEXT: lw a2, 68(a1) +; RV32-FAST-NEXT: sw a2, 68(a0) +; RV32-FAST-NEXT: lw a2, 64(a1) +; RV32-FAST-NEXT: sw a2, 64(a0) +; RV32-FAST-NEXT: lw a2, 60(a1) +; RV32-FAST-NEXT: sw a2, 60(a0) +; RV32-FAST-NEXT: lw a2, 56(a1) +; RV32-FAST-NEXT: sw a2, 56(a0) +; RV32-FAST-NEXT: lw a2, 52(a1) +; RV32-FAST-NEXT: sw a2, 52(a0) +; RV32-FAST-NEXT: lw a2, 48(a1) +; RV32-FAST-NEXT: sw a2, 48(a0) +; RV32-FAST-NEXT: lw a2, 44(a1) +; RV32-FAST-NEXT: sw a2, 44(a0) +; RV32-FAST-NEXT: lw a2, 40(a1) +; RV32-FAST-NEXT: sw a2, 40(a0) +; RV32-FAST-NEXT: lw a2, 36(a1) +; RV32-FAST-NEXT: sw a2, 36(a0) +; RV32-FAST-NEXT: lw a2, 32(a1) +; RV32-FAST-NEXT: sw a2, 32(a0) +; RV32-FAST-NEXT: lw a2, 28(a1) +; RV32-FAST-NEXT: sw a2, 28(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: sw a2, 16(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy96: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 88(a1) +; RV64-FAST-NEXT: sd a2, 88(a0) +; RV64-FAST-NEXT: ld a2, 80(a1) +; RV64-FAST-NEXT: sd a2, 80(a0) +; RV64-FAST-NEXT: ld a2, 72(a1) +; RV64-FAST-NEXT: sd a2, 72(a0) +; RV64-FAST-NEXT: ld a2, 64(a1) +; RV64-FAST-NEXT: sd a2, 64(a0) +; RV64-FAST-NEXT: ld a2, 56(a1) +; RV64-FAST-NEXT: sd a2, 56(a0) +; RV64-FAST-NEXT: ld a2, 48(a1) +; RV64-FAST-NEXT: sd a2, 48(a0) +; RV64-FAST-NEXT: ld a2, 40(a1) +; RV64-FAST-NEXT: sd a2, 40(a0) +; RV64-FAST-NEXT: ld a2, 32(a1) +; RV64-FAST-NEXT: sd a2, 32(a0) +; RV64-FAST-NEXT: ld a2, 24(a1) +; RV64-FAST-NEXT: sd a2, 24(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 96, i1 false) + ret void +} + +define void @unaligned_memcpy128(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy128: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a2, 128 +; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy128: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a2, 128 +; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy128: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 124(a1) +; RV32-FAST-NEXT: sw a2, 124(a0) +; RV32-FAST-NEXT: lw a2, 120(a1) +; RV32-FAST-NEXT: sw a2, 120(a0) +; RV32-FAST-NEXT: lw a2, 116(a1) +; RV32-FAST-NEXT: sw a2, 116(a0) +; RV32-FAST-NEXT: lw a2, 112(a1) +; RV32-FAST-NEXT: sw a2, 112(a0) +; RV32-FAST-NEXT: lw a2, 108(a1) +; RV32-FAST-NEXT: sw a2, 108(a0) +; RV32-FAST-NEXT: lw a2, 104(a1) +; RV32-FAST-NEXT: sw a2, 104(a0) +; RV32-FAST-NEXT: lw a2, 100(a1) +; RV32-FAST-NEXT: sw a2, 100(a0) +; RV32-FAST-NEXT: lw a2, 96(a1) +; RV32-FAST-NEXT: sw a2, 96(a0) +; RV32-FAST-NEXT: lw a2, 92(a1) +; RV32-FAST-NEXT: sw a2, 92(a0) +; RV32-FAST-NEXT: lw a2, 88(a1) +; RV32-FAST-NEXT: sw a2, 88(a0) +; RV32-FAST-NEXT: lw a2, 84(a1) +; RV32-FAST-NEXT: sw a2, 84(a0) +; RV32-FAST-NEXT: lw a2, 80(a1) +; RV32-FAST-NEXT: sw a2, 80(a0) +; RV32-FAST-NEXT: lw a2, 76(a1) +; RV32-FAST-NEXT: sw a2, 76(a0) +; RV32-FAST-NEXT: lw a2, 72(a1) +; RV32-FAST-NEXT: sw a2, 72(a0) +; RV32-FAST-NEXT: lw a2, 68(a1) +; RV32-FAST-NEXT: sw a2, 68(a0) +; RV32-FAST-NEXT: lw a2, 64(a1) +; RV32-FAST-NEXT: sw a2, 64(a0) +; RV32-FAST-NEXT: lw a2, 60(a1) +; RV32-FAST-NEXT: sw a2, 60(a0) +; RV32-FAST-NEXT: lw a2, 56(a1) +; RV32-FAST-NEXT: sw a2, 56(a0) +; RV32-FAST-NEXT: lw a2, 52(a1) +; RV32-FAST-NEXT: sw a2, 52(a0) +; RV32-FAST-NEXT: lw a2, 48(a1) +; RV32-FAST-NEXT: sw a2, 48(a0) +; RV32-FAST-NEXT: lw a2, 44(a1) +; RV32-FAST-NEXT: sw a2, 44(a0) +; RV32-FAST-NEXT: lw a2, 40(a1) +; RV32-FAST-NEXT: sw a2, 40(a0) +; RV32-FAST-NEXT: lw a2, 36(a1) +; RV32-FAST-NEXT: sw a2, 36(a0) +; RV32-FAST-NEXT: lw a2, 32(a1) +; RV32-FAST-NEXT: sw a2, 32(a0) +; RV32-FAST-NEXT: lw a2, 28(a1) +; RV32-FAST-NEXT: sw a2, 28(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: sw a2, 16(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy128: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 120(a1) +; RV64-FAST-NEXT: sd a2, 120(a0) +; RV64-FAST-NEXT: ld a2, 112(a1) +; RV64-FAST-NEXT: sd a2, 112(a0) +; RV64-FAST-NEXT: ld a2, 104(a1) +; RV64-FAST-NEXT: sd a2, 104(a0) +; RV64-FAST-NEXT: ld a2, 96(a1) +; RV64-FAST-NEXT: sd a2, 96(a0) +; RV64-FAST-NEXT: ld a2, 88(a1) +; RV64-FAST-NEXT: sd a2, 88(a0) +; RV64-FAST-NEXT: ld a2, 80(a1) +; RV64-FAST-NEXT: sd a2, 80(a0) +; RV64-FAST-NEXT: ld a2, 72(a1) +; RV64-FAST-NEXT: sd a2, 72(a0) +; RV64-FAST-NEXT: ld a2, 64(a1) +; RV64-FAST-NEXT: sd a2, 64(a0) +; RV64-FAST-NEXT: ld a2, 56(a1) +; RV64-FAST-NEXT: sd a2, 56(a0) +; RV64-FAST-NEXT: ld a2, 48(a1) +; RV64-FAST-NEXT: sd a2, 48(a0) +; RV64-FAST-NEXT: ld a2, 40(a1) +; RV64-FAST-NEXT: sd a2, 40(a0) +; RV64-FAST-NEXT: ld a2, 32(a1) +; RV64-FAST-NEXT: sd a2, 32(a0) +; RV64-FAST-NEXT: ld a2, 24(a1) +; RV64-FAST-NEXT: sd a2, 24(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 128, i1 false) + ret void +} + +define void @unaligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy196: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a2, 128 +; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: addi a2, a1, 128 +; RV32-NEXT: li a3, 64 +; RV32-NEXT: vsetvli zero, a3, e8, m4, ta, ma +; RV32-NEXT: vle8.v v8, (a2) +; RV32-NEXT: addi a2, a0, 128 +; RV32-NEXT: vse8.v v8, (a2) +; RV32-NEXT: addi a1, a1, 192 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: addi a0, a0, 192 +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy196: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a2, 128 +; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: addi a2, a1, 128 +; RV64-NEXT: li a3, 64 +; RV64-NEXT: vsetvli zero, a3, e8, m4, ta, ma +; RV64-NEXT: vle8.v v8, (a2) +; RV64-NEXT: addi a2, a0, 128 +; RV64-NEXT: vse8.v v8, (a2) +; RV64-NEXT: addi a1, a1, 192 +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: addi a0, a0, 192 +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy196: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 192(a1) +; RV32-FAST-NEXT: sw a2, 192(a0) +; RV32-FAST-NEXT: lw a2, 188(a1) +; RV32-FAST-NEXT: sw a2, 188(a0) +; RV32-FAST-NEXT: lw a2, 184(a1) +; RV32-FAST-NEXT: sw a2, 184(a0) +; RV32-FAST-NEXT: lw a2, 180(a1) +; RV32-FAST-NEXT: sw a2, 180(a0) +; RV32-FAST-NEXT: lw a2, 176(a1) +; RV32-FAST-NEXT: sw a2, 176(a0) +; RV32-FAST-NEXT: lw a2, 172(a1) +; RV32-FAST-NEXT: sw a2, 172(a0) +; RV32-FAST-NEXT: lw a2, 168(a1) +; RV32-FAST-NEXT: sw a2, 168(a0) +; RV32-FAST-NEXT: lw a2, 164(a1) +; RV32-FAST-NEXT: sw a2, 164(a0) +; RV32-FAST-NEXT: lw a2, 160(a1) +; RV32-FAST-NEXT: sw a2, 160(a0) +; RV32-FAST-NEXT: lw a2, 156(a1) +; RV32-FAST-NEXT: sw a2, 156(a0) +; RV32-FAST-NEXT: lw a2, 152(a1) +; RV32-FAST-NEXT: sw a2, 152(a0) +; RV32-FAST-NEXT: lw a2, 148(a1) +; RV32-FAST-NEXT: sw a2, 148(a0) +; RV32-FAST-NEXT: lw a2, 144(a1) +; RV32-FAST-NEXT: sw a2, 144(a0) +; RV32-FAST-NEXT: lw a2, 140(a1) +; RV32-FAST-NEXT: sw a2, 140(a0) +; RV32-FAST-NEXT: lw a2, 136(a1) +; RV32-FAST-NEXT: sw a2, 136(a0) +; RV32-FAST-NEXT: lw a2, 132(a1) +; RV32-FAST-NEXT: sw a2, 132(a0) +; RV32-FAST-NEXT: lw a2, 128(a1) +; RV32-FAST-NEXT: sw a2, 128(a0) +; RV32-FAST-NEXT: lw a2, 124(a1) +; RV32-FAST-NEXT: sw a2, 124(a0) +; RV32-FAST-NEXT: lw a2, 120(a1) +; RV32-FAST-NEXT: sw a2, 120(a0) +; RV32-FAST-NEXT: lw a2, 116(a1) +; RV32-FAST-NEXT: sw a2, 116(a0) +; RV32-FAST-NEXT: lw a2, 112(a1) +; RV32-FAST-NEXT: sw a2, 112(a0) +; RV32-FAST-NEXT: lw a2, 108(a1) +; RV32-FAST-NEXT: sw a2, 108(a0) +; RV32-FAST-NEXT: lw a2, 104(a1) +; RV32-FAST-NEXT: sw a2, 104(a0) +; RV32-FAST-NEXT: lw a2, 100(a1) +; RV32-FAST-NEXT: sw a2, 100(a0) +; RV32-FAST-NEXT: lw a2, 96(a1) +; RV32-FAST-NEXT: sw a2, 96(a0) +; RV32-FAST-NEXT: lw a2, 92(a1) +; RV32-FAST-NEXT: sw a2, 92(a0) +; RV32-FAST-NEXT: lw a2, 88(a1) +; RV32-FAST-NEXT: sw a2, 88(a0) +; RV32-FAST-NEXT: lw a2, 84(a1) +; RV32-FAST-NEXT: sw a2, 84(a0) +; RV32-FAST-NEXT: lw a2, 80(a1) +; RV32-FAST-NEXT: sw a2, 80(a0) +; RV32-FAST-NEXT: lw a2, 76(a1) +; RV32-FAST-NEXT: sw a2, 76(a0) +; RV32-FAST-NEXT: lw a2, 72(a1) +; RV32-FAST-NEXT: sw a2, 72(a0) +; RV32-FAST-NEXT: lw a2, 68(a1) +; RV32-FAST-NEXT: sw a2, 68(a0) +; RV32-FAST-NEXT: lw a2, 64(a1) +; RV32-FAST-NEXT: sw a2, 64(a0) +; RV32-FAST-NEXT: lw a2, 60(a1) +; RV32-FAST-NEXT: sw a2, 60(a0) +; RV32-FAST-NEXT: lw a2, 56(a1) +; RV32-FAST-NEXT: sw a2, 56(a0) +; RV32-FAST-NEXT: lw a2, 52(a1) +; RV32-FAST-NEXT: sw a2, 52(a0) +; RV32-FAST-NEXT: lw a2, 48(a1) +; RV32-FAST-NEXT: sw a2, 48(a0) +; RV32-FAST-NEXT: lw a2, 44(a1) +; RV32-FAST-NEXT: sw a2, 44(a0) +; RV32-FAST-NEXT: lw a2, 40(a1) +; RV32-FAST-NEXT: sw a2, 40(a0) +; RV32-FAST-NEXT: lw a2, 36(a1) +; RV32-FAST-NEXT: sw a2, 36(a0) +; RV32-FAST-NEXT: lw a2, 32(a1) +; RV32-FAST-NEXT: sw a2, 32(a0) +; RV32-FAST-NEXT: lw a2, 28(a1) +; RV32-FAST-NEXT: sw a2, 28(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: sw a2, 16(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy196: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 192(a1) +; RV64-FAST-NEXT: sw a2, 192(a0) +; RV64-FAST-NEXT: ld a2, 184(a1) +; RV64-FAST-NEXT: sd a2, 184(a0) +; RV64-FAST-NEXT: ld a2, 176(a1) +; RV64-FAST-NEXT: sd a2, 176(a0) +; RV64-FAST-NEXT: ld a2, 168(a1) +; RV64-FAST-NEXT: sd a2, 168(a0) +; RV64-FAST-NEXT: ld a2, 160(a1) +; RV64-FAST-NEXT: sd a2, 160(a0) +; RV64-FAST-NEXT: ld a2, 152(a1) +; RV64-FAST-NEXT: sd a2, 152(a0) +; RV64-FAST-NEXT: ld a2, 144(a1) +; RV64-FAST-NEXT: sd a2, 144(a0) +; RV64-FAST-NEXT: ld a2, 136(a1) +; RV64-FAST-NEXT: sd a2, 136(a0) +; RV64-FAST-NEXT: ld a2, 128(a1) +; RV64-FAST-NEXT: sd a2, 128(a0) +; RV64-FAST-NEXT: ld a2, 120(a1) +; RV64-FAST-NEXT: sd a2, 120(a0) +; RV64-FAST-NEXT: ld a2, 112(a1) +; RV64-FAST-NEXT: sd a2, 112(a0) +; RV64-FAST-NEXT: ld a2, 104(a1) +; RV64-FAST-NEXT: sd a2, 104(a0) +; RV64-FAST-NEXT: ld a2, 96(a1) +; RV64-FAST-NEXT: sd a2, 96(a0) +; RV64-FAST-NEXT: ld a2, 88(a1) +; RV64-FAST-NEXT: sd a2, 88(a0) +; RV64-FAST-NEXT: ld a2, 80(a1) +; RV64-FAST-NEXT: sd a2, 80(a0) +; RV64-FAST-NEXT: ld a2, 72(a1) +; RV64-FAST-NEXT: sd a2, 72(a0) +; RV64-FAST-NEXT: ld a2, 64(a1) +; RV64-FAST-NEXT: sd a2, 64(a0) +; RV64-FAST-NEXT: ld a2, 56(a1) +; RV64-FAST-NEXT: sd a2, 56(a0) +; RV64-FAST-NEXT: ld a2, 48(a1) +; RV64-FAST-NEXT: sd a2, 48(a0) +; RV64-FAST-NEXT: ld a2, 40(a1) +; RV64-FAST-NEXT: sd a2, 40(a0) +; RV64-FAST-NEXT: ld a2, 32(a1) +; RV64-FAST-NEXT: sd a2, 32(a0) +; RV64-FAST-NEXT: ld a2, 24(a1) +; RV64-FAST-NEXT: sd a2, 24(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 196, i1 false) + ret void +} + +define void @unaligned_memcpy256(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: unaligned_memcpy256: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a2, 128 +; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: addi a1, a1, 128 +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: addi a0, a0, 128 +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: unaligned_memcpy256: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a2, 128 +; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: addi a1, a1, 128 +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: addi a0, a0, 128 +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: unaligned_memcpy256: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 252(a1) +; RV32-FAST-NEXT: sw a2, 252(a0) +; RV32-FAST-NEXT: lw a2, 248(a1) +; RV32-FAST-NEXT: sw a2, 248(a0) +; RV32-FAST-NEXT: lw a2, 244(a1) +; RV32-FAST-NEXT: sw a2, 244(a0) +; RV32-FAST-NEXT: lw a2, 240(a1) +; RV32-FAST-NEXT: sw a2, 240(a0) +; RV32-FAST-NEXT: lw a2, 236(a1) +; RV32-FAST-NEXT: sw a2, 236(a0) +; RV32-FAST-NEXT: lw a2, 232(a1) +; RV32-FAST-NEXT: sw a2, 232(a0) +; RV32-FAST-NEXT: lw a2, 228(a1) +; RV32-FAST-NEXT: sw a2, 228(a0) +; RV32-FAST-NEXT: lw a2, 224(a1) +; RV32-FAST-NEXT: sw a2, 224(a0) +; RV32-FAST-NEXT: lw a2, 220(a1) +; RV32-FAST-NEXT: sw a2, 220(a0) +; RV32-FAST-NEXT: lw a2, 216(a1) +; RV32-FAST-NEXT: sw a2, 216(a0) +; RV32-FAST-NEXT: lw a2, 212(a1) +; RV32-FAST-NEXT: sw a2, 212(a0) +; RV32-FAST-NEXT: lw a2, 208(a1) +; RV32-FAST-NEXT: sw a2, 208(a0) +; RV32-FAST-NEXT: lw a2, 204(a1) +; RV32-FAST-NEXT: sw a2, 204(a0) +; RV32-FAST-NEXT: lw a2, 200(a1) +; RV32-FAST-NEXT: sw a2, 200(a0) +; RV32-FAST-NEXT: lw a2, 196(a1) +; RV32-FAST-NEXT: sw a2, 196(a0) +; RV32-FAST-NEXT: lw a2, 192(a1) +; RV32-FAST-NEXT: sw a2, 192(a0) +; RV32-FAST-NEXT: lw a2, 188(a1) +; RV32-FAST-NEXT: sw a2, 188(a0) +; RV32-FAST-NEXT: lw a2, 184(a1) +; RV32-FAST-NEXT: sw a2, 184(a0) +; RV32-FAST-NEXT: lw a2, 180(a1) +; RV32-FAST-NEXT: sw a2, 180(a0) +; RV32-FAST-NEXT: lw a2, 176(a1) +; RV32-FAST-NEXT: sw a2, 176(a0) +; RV32-FAST-NEXT: lw a2, 172(a1) +; RV32-FAST-NEXT: sw a2, 172(a0) +; RV32-FAST-NEXT: lw a2, 168(a1) +; RV32-FAST-NEXT: sw a2, 168(a0) +; RV32-FAST-NEXT: lw a2, 164(a1) +; RV32-FAST-NEXT: sw a2, 164(a0) +; RV32-FAST-NEXT: lw a2, 160(a1) +; RV32-FAST-NEXT: sw a2, 160(a0) +; RV32-FAST-NEXT: lw a2, 156(a1) +; RV32-FAST-NEXT: sw a2, 156(a0) +; RV32-FAST-NEXT: lw a2, 152(a1) +; RV32-FAST-NEXT: sw a2, 152(a0) +; RV32-FAST-NEXT: lw a2, 148(a1) +; RV32-FAST-NEXT: sw a2, 148(a0) +; RV32-FAST-NEXT: lw a2, 144(a1) +; RV32-FAST-NEXT: sw a2, 144(a0) +; RV32-FAST-NEXT: lw a2, 140(a1) +; RV32-FAST-NEXT: sw a2, 140(a0) +; RV32-FAST-NEXT: lw a2, 136(a1) +; RV32-FAST-NEXT: sw a2, 136(a0) +; RV32-FAST-NEXT: lw a2, 132(a1) +; RV32-FAST-NEXT: sw a2, 132(a0) +; RV32-FAST-NEXT: lw a2, 128(a1) +; RV32-FAST-NEXT: sw a2, 128(a0) +; RV32-FAST-NEXT: lw a2, 124(a1) +; RV32-FAST-NEXT: sw a2, 124(a0) +; RV32-FAST-NEXT: lw a2, 120(a1) +; RV32-FAST-NEXT: sw a2, 120(a0) +; RV32-FAST-NEXT: lw a2, 116(a1) +; RV32-FAST-NEXT: sw a2, 116(a0) +; RV32-FAST-NEXT: lw a2, 112(a1) +; RV32-FAST-NEXT: sw a2, 112(a0) +; RV32-FAST-NEXT: lw a2, 108(a1) +; RV32-FAST-NEXT: sw a2, 108(a0) +; RV32-FAST-NEXT: lw a2, 104(a1) +; RV32-FAST-NEXT: sw a2, 104(a0) +; RV32-FAST-NEXT: lw a2, 100(a1) +; RV32-FAST-NEXT: sw a2, 100(a0) +; RV32-FAST-NEXT: lw a2, 96(a1) +; RV32-FAST-NEXT: sw a2, 96(a0) +; RV32-FAST-NEXT: lw a2, 92(a1) +; RV32-FAST-NEXT: sw a2, 92(a0) +; RV32-FAST-NEXT: lw a2, 88(a1) +; RV32-FAST-NEXT: sw a2, 88(a0) +; RV32-FAST-NEXT: lw a2, 84(a1) +; RV32-FAST-NEXT: sw a2, 84(a0) +; RV32-FAST-NEXT: lw a2, 80(a1) +; RV32-FAST-NEXT: sw a2, 80(a0) +; RV32-FAST-NEXT: lw a2, 76(a1) +; RV32-FAST-NEXT: sw a2, 76(a0) +; RV32-FAST-NEXT: lw a2, 72(a1) +; RV32-FAST-NEXT: sw a2, 72(a0) +; RV32-FAST-NEXT: lw a2, 68(a1) +; RV32-FAST-NEXT: sw a2, 68(a0) +; RV32-FAST-NEXT: lw a2, 64(a1) +; RV32-FAST-NEXT: sw a2, 64(a0) +; RV32-FAST-NEXT: lw a2, 60(a1) +; RV32-FAST-NEXT: sw a2, 60(a0) +; RV32-FAST-NEXT: lw a2, 56(a1) +; RV32-FAST-NEXT: sw a2, 56(a0) +; RV32-FAST-NEXT: lw a2, 52(a1) +; RV32-FAST-NEXT: sw a2, 52(a0) +; RV32-FAST-NEXT: lw a2, 48(a1) +; RV32-FAST-NEXT: sw a2, 48(a0) +; RV32-FAST-NEXT: lw a2, 44(a1) +; RV32-FAST-NEXT: sw a2, 44(a0) +; RV32-FAST-NEXT: lw a2, 40(a1) +; RV32-FAST-NEXT: sw a2, 40(a0) +; RV32-FAST-NEXT: lw a2, 36(a1) +; RV32-FAST-NEXT: sw a2, 36(a0) +; RV32-FAST-NEXT: lw a2, 32(a1) +; RV32-FAST-NEXT: sw a2, 32(a0) +; RV32-FAST-NEXT: lw a2, 28(a1) +; RV32-FAST-NEXT: sw a2, 28(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: lw a2, 20(a1) +; RV32-FAST-NEXT: sw a2, 20(a0) +; RV32-FAST-NEXT: lw a2, 16(a1) +; RV32-FAST-NEXT: sw a2, 16(a0) +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: unaligned_memcpy256: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 248(a1) +; RV64-FAST-NEXT: sd a2, 248(a0) +; RV64-FAST-NEXT: ld a2, 240(a1) +; RV64-FAST-NEXT: sd a2, 240(a0) +; RV64-FAST-NEXT: ld a2, 232(a1) +; RV64-FAST-NEXT: sd a2, 232(a0) +; RV64-FAST-NEXT: ld a2, 224(a1) +; RV64-FAST-NEXT: sd a2, 224(a0) +; RV64-FAST-NEXT: ld a2, 216(a1) +; RV64-FAST-NEXT: sd a2, 216(a0) +; RV64-FAST-NEXT: ld a2, 208(a1) +; RV64-FAST-NEXT: sd a2, 208(a0) +; RV64-FAST-NEXT: ld a2, 200(a1) +; RV64-FAST-NEXT: sd a2, 200(a0) +; RV64-FAST-NEXT: ld a2, 192(a1) +; RV64-FAST-NEXT: sd a2, 192(a0) +; RV64-FAST-NEXT: ld a2, 184(a1) +; RV64-FAST-NEXT: sd a2, 184(a0) +; RV64-FAST-NEXT: ld a2, 176(a1) +; RV64-FAST-NEXT: sd a2, 176(a0) +; RV64-FAST-NEXT: ld a2, 168(a1) +; RV64-FAST-NEXT: sd a2, 168(a0) +; RV64-FAST-NEXT: ld a2, 160(a1) +; RV64-FAST-NEXT: sd a2, 160(a0) +; RV64-FAST-NEXT: ld a2, 152(a1) +; RV64-FAST-NEXT: sd a2, 152(a0) +; RV64-FAST-NEXT: ld a2, 144(a1) +; RV64-FAST-NEXT: sd a2, 144(a0) +; RV64-FAST-NEXT: ld a2, 136(a1) +; RV64-FAST-NEXT: sd a2, 136(a0) +; RV64-FAST-NEXT: ld a2, 128(a1) +; RV64-FAST-NEXT: sd a2, 128(a0) +; RV64-FAST-NEXT: ld a2, 120(a1) +; RV64-FAST-NEXT: sd a2, 120(a0) +; RV64-FAST-NEXT: ld a2, 112(a1) +; RV64-FAST-NEXT: sd a2, 112(a0) +; RV64-FAST-NEXT: ld a2, 104(a1) +; RV64-FAST-NEXT: sd a2, 104(a0) +; RV64-FAST-NEXT: ld a2, 96(a1) +; RV64-FAST-NEXT: sd a2, 96(a0) +; RV64-FAST-NEXT: ld a2, 88(a1) +; RV64-FAST-NEXT: sd a2, 88(a0) +; RV64-FAST-NEXT: ld a2, 80(a1) +; RV64-FAST-NEXT: sd a2, 80(a0) +; RV64-FAST-NEXT: ld a2, 72(a1) +; RV64-FAST-NEXT: sd a2, 72(a0) +; RV64-FAST-NEXT: ld a2, 64(a1) +; RV64-FAST-NEXT: sd a2, 64(a0) +; RV64-FAST-NEXT: ld a2, 56(a1) +; RV64-FAST-NEXT: sd a2, 56(a0) +; RV64-FAST-NEXT: ld a2, 48(a1) +; RV64-FAST-NEXT: sd a2, 48(a0) +; RV64-FAST-NEXT: ld a2, 40(a1) +; RV64-FAST-NEXT: sd a2, 40(a0) +; RV64-FAST-NEXT: ld a2, 32(a1) +; RV64-FAST-NEXT: sd a2, 32(a0) +; RV64-FAST-NEXT: ld a2, 24(a1) +; RV64-FAST-NEXT: sd a2, 24(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 256, i1 false) + ret void +} + + +; ---------------------------------------------------------------------- +; Fully aligned cases + +define void @aligned_memcpy2(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy2: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lh a1, 0(a1) +; RV32-BOTH-NEXT: sh a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy2: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lh a1, 0(a1) +; RV64-BOTH-NEXT: sh a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 2, i1 false) + ret void +} + +define void @aligned_memcpy3(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy3: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lbu a2, 2(a1) +; RV32-BOTH-NEXT: sb a2, 2(a0) +; RV32-BOTH-NEXT: lh a1, 0(a1) +; RV32-BOTH-NEXT: sh a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy3: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lbu a2, 2(a1) +; RV64-BOTH-NEXT: sb a2, 2(a0) +; RV64-BOTH-NEXT: lh a1, 0(a1) +; RV64-BOTH-NEXT: sh a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 3, i1 false) + ret void +} + +define void @aligned_memcpy4(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy4: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy4: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lw a1, 0(a1) +; RV64-BOTH-NEXT: sw a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 4, i1 false) + ret void +} + +define void @aligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy7: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 6(a1) +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: lh a2, 4(a1) +; RV32-NEXT: sh a2, 4(a0) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: aligned_memcpy7: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 6(a1) +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: lh a2, 4(a1) +; RV64-NEXT: sh a2, 4(a0) +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: sw a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: aligned_memcpy7: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 3(a1) +; RV32-FAST-NEXT: sw a2, 3(a0) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: aligned_memcpy7: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 3(a1) +; RV64-FAST-NEXT: sw a2, 3(a0) +; RV64-FAST-NEXT: lw a1, 0(a1) +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 7, i1 false) + ret void +} + +define void @aligned_memcpy8(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy8: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-BOTH-NEXT: vle32.v v8, (a1) +; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy8: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: ld a1, 0(a1) +; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 8, i1 false) + ret void +} + +define void @aligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy15: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 14(a1) +; RV32-NEXT: sb a2, 14(a0) +; RV32-NEXT: lh a2, 12(a1) +; RV32-NEXT: sh a2, 12(a0) +; RV32-NEXT: lw a2, 8(a1) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: aligned_memcpy15: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 14(a1) +; RV64-NEXT: sb a2, 14(a0) +; RV64-NEXT: lh a2, 12(a1) +; RV64-NEXT: sh a2, 12(a0) +; RV64-NEXT: lw a2, 8(a1) +; RV64-NEXT: sw a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: aligned_memcpy15: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 11(a1) +; RV32-FAST-NEXT: sw a2, 11(a0) +; RV32-FAST-NEXT: lw a2, 8(a1) +; RV32-FAST-NEXT: sw a2, 8(a0) +; RV32-FAST-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-FAST-NEXT: vle32.v v8, (a1) +; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: aligned_memcpy15: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 7(a1) +; RV64-FAST-NEXT: sd a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 15, i1 false) + ret void +} + +define void @aligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy16: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-BOTH-NEXT: vle32.v v8, (a1) +; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy16: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-BOTH-NEXT: vle64.v v8, (a1) +; RV64-BOTH-NEXT: vse64.v v8, (a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 16, i1 false) + ret void +} + +define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { +; RV32-LABEL: aligned_memcpy31: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 30(a1) +; RV32-NEXT: sb a2, 30(a0) +; RV32-NEXT: lh a2, 28(a1) +; RV32-NEXT: sh a2, 28(a0) +; RV32-NEXT: lw a2, 24(a1) +; RV32-NEXT: sw a2, 24(a0) +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: aligned_memcpy31: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 30(a1) +; RV64-NEXT: sb a2, 30(a0) +; RV64-NEXT: lh a2, 28(a1) +; RV64-NEXT: sh a2, 28(a0) +; RV64-NEXT: lw a2, 24(a1) +; RV64-NEXT: sw a2, 24(a0) +; RV64-NEXT: ld a2, 16(a1) +; RV64-NEXT: sd a2, 16(a0) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vle64.v v8, (a1) +; RV64-NEXT: vse64.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: aligned_memcpy31: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 27(a1) +; RV32-FAST-NEXT: sw a2, 27(a0) +; RV32-FAST-NEXT: lw a2, 24(a1) +; RV32-FAST-NEXT: sw a2, 24(a0) +; RV32-FAST-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-FAST-NEXT: vle32.v v8, (a1) +; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: addi a1, a1, 16 +; RV32-FAST-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-FAST-NEXT: vle32.v v8, (a1) +; RV32-FAST-NEXT: addi a0, a0, 16 +; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: aligned_memcpy31: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 23(a1) +; RV64-FAST-NEXT: sd a2, 23(a0) +; RV64-FAST-NEXT: ld a2, 16(a1) +; RV64-FAST-NEXT: sd a2, 16(a0) +; RV64-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-FAST-NEXT: vle64.v v8, (a1) +; RV64-FAST-NEXT: vse64.v v8, (a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 31, i1 false) + ret void +} + +define void @aligned_memcpy32(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy32: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-BOTH-NEXT: vle32.v v8, (a1) +; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy32: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-BOTH-NEXT: vle64.v v8, (a1) +; RV64-BOTH-NEXT: vse64.v v8, (a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 32, i1 false) + ret void +} + +define void @aligned_memcpy64(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy64: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-BOTH-NEXT: vle32.v v8, (a1) +; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy64: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-BOTH-NEXT: vle64.v v8, (a1) +; RV64-BOTH-NEXT: vse64.v v8, (a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 64, i1 false) + ret void +} + +define void @aligned_memcpy96(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy96: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-BOTH-NEXT: vle32.v v8, (a1) +; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: addi a1, a1, 64 +; RV32-BOTH-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-BOTH-NEXT: vle32.v v8, (a1) +; RV32-BOTH-NEXT: addi a0, a0, 64 +; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy96: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-BOTH-NEXT: vle64.v v8, (a1) +; RV64-BOTH-NEXT: vse64.v v8, (a0) +; RV64-BOTH-NEXT: addi a1, a1, 64 +; RV64-BOTH-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-BOTH-NEXT: vle64.v v8, (a1) +; RV64-BOTH-NEXT: addi a0, a0, 64 +; RV64-BOTH-NEXT: vse64.v v8, (a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 96, i1 false) + ret void +} + +define void @aligned_memcpy128(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy128: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: li a2, 32 +; RV32-BOTH-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-BOTH-NEXT: vle32.v v8, (a1) +; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy128: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-BOTH-NEXT: vle64.v v8, (a1) +; RV64-BOTH-NEXT: vse64.v v8, (a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 128, i1 false) + ret void +} + +define void @aligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy196: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: li a2, 32 +; RV32-BOTH-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-BOTH-NEXT: vle32.v v8, (a1) +; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: lw a2, 192(a1) +; RV32-BOTH-NEXT: sw a2, 192(a0) +; RV32-BOTH-NEXT: addi a1, a1, 128 +; RV32-BOTH-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-BOTH-NEXT: vle32.v v8, (a1) +; RV32-BOTH-NEXT: addi a0, a0, 128 +; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy196: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: lw a2, 192(a1) +; RV64-BOTH-NEXT: sw a2, 192(a0) +; RV64-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-BOTH-NEXT: vle64.v v8, (a1) +; RV64-BOTH-NEXT: vse64.v v8, (a0) +; RV64-BOTH-NEXT: addi a1, a1, 128 +; RV64-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-BOTH-NEXT: vle64.v v8, (a1) +; RV64-BOTH-NEXT: addi a0, a0, 128 +; RV64-BOTH-NEXT: vse64.v v8, (a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 196, i1 false) + ret void +} + +define void @aligned_memcpy256(ptr nocapture %dest, ptr %src) nounwind { +; RV32-BOTH-LABEL: aligned_memcpy256: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: li a2, 32 +; RV32-BOTH-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-BOTH-NEXT: vle32.v v8, (a1) +; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: addi a1, a1, 128 +; RV32-BOTH-NEXT: vle32.v v8, (a1) +; RV32-BOTH-NEXT: addi a0, a0, 128 +; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: aligned_memcpy256: +; RV64-BOTH: # %bb.0: # %entry +; RV64-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-BOTH-NEXT: vle64.v v8, (a1) +; RV64-BOTH-NEXT: vse64.v v8, (a0) +; RV64-BOTH-NEXT: addi a1, a1, 128 +; RV64-BOTH-NEXT: vle64.v v8, (a1) +; RV64-BOTH-NEXT: addi a0, a0, 128 +; RV64-BOTH-NEXT: vse64.v v8, (a0) +; RV64-BOTH-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 256, i1 false) + ret void +} + +; ------------------------------------------------------------------------ +; A few partially aligned cases + + +define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { +; RV32-BOTH-LABEL: memcpy16_align4: +; RV32-BOTH: # %bb.0: # %entry +; RV32-BOTH-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-BOTH-NEXT: vle32.v v8, (a1) +; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: ret +; +; RV64-LABEL: memcpy16_align4: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vle32.v v8, (a1) +; RV64-NEXT: vse32.v v8, (a0) +; RV64-NEXT: ret +; +; RV64-FAST-LABEL: memcpy16_align4: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: ld a2, 8(a1) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret +entry: + tail call void @llvm.memcpy.inline.p0.p0.i32(ptr align 4 %dest, ptr align 4 %src, i32 16, i1 false) + ret void +} + +define i32 @memcpy11_align8(ptr nocapture %dest, ptr %src) { +; RV32-LABEL: memcpy11_align8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lbu a2, 10(a1) +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: lh a2, 8(a1) +; RV32-NEXT: sh a2, 8(a0) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: memcpy11_align8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lbu a2, 10(a1) +; RV64-NEXT: sb a2, 10(a0) +; RV64-NEXT: lh a2, 8(a1) +; RV64-NEXT: sh a2, 8(a0) +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: sd a1, 0(a0) +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memcpy11_align8: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: lw a2, 7(a1) +; RV32-FAST-NEXT: sw a2, 7(a0) +; RV32-FAST-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-FAST-NEXT: vle32.v v8, (a1) +; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: li a0, 0 +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memcpy11_align8: +; RV64-FAST: # %bb.0: # %entry +; RV64-FAST-NEXT: lw a2, 7(a1) +; RV64-FAST-NEXT: sw a2, 7(a0) +; RV64-FAST-NEXT: ld a1, 0(a1) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: li a0, 0 +; RV64-FAST-NEXT: ret +entry: + call void @llvm.memcpy.inline.p0.p0.i32(ptr align 8 %dest, ptr align 8 %src, i32 11, i1 false) + ret i32 0 +} + + +declare void @llvm.memcpy.inline.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1) nounwind +declare void @llvm.memcpy.inline.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind -- 2.7.4