From 73fd076760b67d4d56b7fbcbff34406eef2622a0 Mon Sep 17 00:00:00 2001 From: Yang Rong Date: Tue, 14 Oct 2014 11:48:20 +0800 Subject: [PATCH] Fix memcpy and memset bug. In ocl_memcpy.ll and ocl_memset.ll, index+4 should be less than size when use int in memcpy and memset, and need consider alignment. V3: For performance, provide two versions of memcpy and memset, decide call which one when lowering intrinsic. V4: add these new functions in the bitcode link filter list. Signed-off-by: Yang Rong Signed-off-by: Zhigang Gong --- backend/src/libocl/src/ocl_memcpy.ll | 256 ++++++++++++++++++++++++--- backend/src/libocl/src/ocl_memset.ll | 80 ++++++++- backend/src/llvm/llvm_bitcode_link.cpp | 14 ++ backend/src/llvm/llvm_intrinsic_lowering.cpp | 14 +- 4 files changed, 335 insertions(+), 29 deletions(-) diff --git a/backend/src/libocl/src/ocl_memcpy.ll b/backend/src/libocl/src/ocl_memcpy.ll index 476033e..fbc44d1 100644 --- a/backend/src/libocl/src/ocl_memcpy.ll +++ b/backend/src/libocl/src/ocl_memcpy.ll @@ -1,7 +1,7 @@ ;The memcpy's source code. 
-; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) { +; INLINE_OVERLOADABLE void __gen_memcpy_align(uchar* dst, uchar* src, size_t size) { ; size_t index = 0; -; while((index + 4) >= size) { +; while((index + 4) <= size) { ; *((uint *)(dst + index)) = *((uint *)(src + index)); ; index += 4; ; } @@ -11,14 +11,14 @@ ; } ; } -define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline { +define void @__gen_memcpy_gg_align(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline { entry: br label %while.cond while.cond: ; preds = %while.body, %entry %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] %add = add i32 %index.0, 4 - %cmp = icmp ult i32 %add, %size + %cmp = icmp ugt i32 %add, %size br i1 %cmp, label %while.cond3, label %while.body while.body: ; preds = %while.cond @@ -47,14 +47,14 @@ while.end7: ; preds = %while.cond3 ret void } -define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline { +define void @__gen_memcpy_gp_align(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline { entry: br label %while.cond while.cond: ; preds = %while.body, %entry %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] %add = add i32 %index.0, 4 - %cmp = icmp ult i32 %add, %size + %cmp = icmp ugt i32 %add, %size br i1 %cmp, label %while.cond3, label %while.body while.body: ; preds = %while.cond @@ -83,14 +83,14 @@ while.end7: ; preds = %while.cond3 ret void } -define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline { +define void @__gen_memcpy_gl_align(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline { entry: br label %while.cond while.cond: ; preds = %while.body, %entry %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] %add = add i32 %index.0, 4 - %cmp = icmp ult i32 %add, %size + %cmp = icmp ugt i32 %add, %size 
br i1 %cmp, label %while.cond3, label %while.body while.body: ; preds = %while.cond @@ -119,14 +119,14 @@ while.end7: ; preds = %while.cond3 ret void } -define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline { +define void @__gen_memcpy_pg_align(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline { entry: br label %while.cond while.cond: ; preds = %while.body, %entry %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] %add = add i32 %index.0, 4 - %cmp = icmp ult i32 %add, %size + %cmp = icmp ugt i32 %add, %size br i1 %cmp, label %while.cond3, label %while.body while.body: ; preds = %while.cond @@ -155,14 +155,14 @@ while.end7: ; preds = %while.cond3 ret void } -define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline { +define void @__gen_memcpy_pp_align(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline { entry: br label %while.cond while.cond: ; preds = %while.body, %entry %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] %add = add i32 %index.0, 4 - %cmp = icmp ult i32 %add, %size + %cmp = icmp ugt i32 %add, %size br i1 %cmp, label %while.cond3, label %while.body while.body: ; preds = %while.cond @@ -191,14 +191,14 @@ while.end7: ; preds = %while.cond3 ret void } -define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline { +define void @__gen_memcpy_pl_align(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline { entry: br label %while.cond while.cond: ; preds = %while.body, %entry %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] %add = add i32 %index.0, 4 - %cmp = icmp ult i32 %add, %size + %cmp = icmp ugt i32 %add, %size br i1 %cmp, label %while.cond3, label %while.body while.body: ; preds = %while.cond @@ -227,14 +227,14 @@ while.end7: ; preds = %while.cond3 ret void } -define void @__gen_memcpy_lg(i8 
addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline { +define void @__gen_memcpy_lg_align(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline { entry: br label %while.cond while.cond: ; preds = %while.body, %entry %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] %add = add i32 %index.0, 4 - %cmp = icmp ult i32 %add, %size + %cmp = icmp ugt i32 %add, %size br i1 %cmp, label %while.cond3, label %while.body while.body: ; preds = %while.cond @@ -263,14 +263,14 @@ while.end7: ; preds = %while.cond3 ret void } -define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline { +define void @__gen_memcpy_lp_align(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline { entry: br label %while.cond while.cond: ; preds = %while.body, %entry %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] %add = add i32 %index.0, 4 - %cmp = icmp ult i32 %add, %size + %cmp = icmp ugt i32 %add, %size br i1 %cmp, label %while.cond3, label %while.body while.body: ; preds = %while.cond @@ -299,14 +299,14 @@ while.end7: ; preds = %while.cond3 ret void } -define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline { +define void @__gen_memcpy_ll_align(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline { entry: br label %while.cond while.cond: ; preds = %while.body, %entry %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] %add = add i32 %index.0, 4 - %cmp = icmp ult i32 %add, %size + %cmp = icmp ugt i32 %add, %size br i1 %cmp, label %while.cond3, label %while.body while.body: ; preds = %while.cond @@ -334,3 +334,219 @@ while.body5: ; preds = %while.cond3 while.end7: ; preds = %while.cond3 ret void } + +;The memcpy's source code. 
+; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) { +; size_t index = 0; +; while(index < size) { +; dst[index] = src[index]; +; index++; +; } +; } + +define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline { +entry: + %cmp4 = icmp eq i32 %size, 0 + br i1 %cmp4, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %0 = ptrtoint i8 addrspace(1)* %src to i32 + %1 = add i32 %0, %index.05 + %2 = inttoptr i32 %1 to i8 addrspace(1)* + %3 = load i8 addrspace(1)* %2, align 1 + %4 = ptrtoint i8 addrspace(1)* %dst to i32 + %5 = add i32 %4, %index.05 + %6 = inttoptr i32 %5 to i8 addrspace(1)* + store i8 %3, i8 addrspace(1)* %6, align 1 + %inc = add i32 %index.05, 1 + %cmp = icmp ult i32 %inc, %size + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline { +entry: + %cmp4 = icmp eq i32 %size, 0 + br i1 %cmp4, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %0 = ptrtoint i8 addrspace(0)* %src to i32 + %1 = add i32 %0, %index.05 + %2 = inttoptr i32 %1 to i8 addrspace(0)* + %3 = load i8 addrspace(0)* %2, align 1 + %4 = ptrtoint i8 addrspace(1)* %dst to i32 + %5 = add i32 %4, %index.05 + %6 = inttoptr i32 %5 to i8 addrspace(1)* + store i8 %3, i8 addrspace(1)* %6, align 1 + %inc = add i32 %index.05, 1 + %cmp = icmp ult i32 %inc, %size + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline { +entry: + %cmp4 = icmp eq i32 %size, 0 + br i1 %cmp4, label %while.end, label %while.body + +while.body: ; preds = 
%entry, %while.body + %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %0 = ptrtoint i8 addrspace(3)* %src to i32 + %1 = add i32 %0, %index.05 + %2 = inttoptr i32 %1 to i8 addrspace(3)* + %3 = load i8 addrspace(3)* %2, align 1 + %4 = ptrtoint i8 addrspace(1)* %dst to i32 + %5 = add i32 %4, %index.05 + %6 = inttoptr i32 %5 to i8 addrspace(1)* + store i8 %3, i8 addrspace(1)* %6, align 1 + %inc = add i32 %index.05, 1 + %cmp = icmp ult i32 %inc, %size + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline { +entry: + %cmp4 = icmp eq i32 %size, 0 + br i1 %cmp4, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %0 = ptrtoint i8 addrspace(1)* %src to i32 + %1 = add i32 %0, %index.05 + %2 = inttoptr i32 %1 to i8 addrspace(1)* + %3 = load i8 addrspace(1)* %2, align 1 + %4 = ptrtoint i8 addrspace(0)* %dst to i32 + %5 = add i32 %4, %index.05 + %6 = inttoptr i32 %5 to i8 addrspace(0)* + store i8 %3, i8 addrspace(0)* %6, align 1 + %inc = add i32 %index.05, 1 + %cmp = icmp ult i32 %inc, %size + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline { +entry: + %cmp4 = icmp eq i32 %size, 0 + br i1 %cmp4, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %0 = ptrtoint i8 addrspace(0)* %src to i32 + %1 = add i32 %0, %index.05 + %2 = inttoptr i32 %1 to i8 addrspace(0)* + %3 = load i8 addrspace(0)* %2, align 1 + %4 = ptrtoint i8 addrspace(0)* %dst to i32 + %5 = add i32 %4, %index.05 + %6 = inttoptr i32 %5 to i8 addrspace(0)* + store i8 %3, i8 addrspace(0)* %6, align 1 + 
%inc = add i32 %index.05, 1 + %cmp = icmp ult i32 %inc, %size + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline { +entry: + %cmp4 = icmp eq i32 %size, 0 + br i1 %cmp4, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %0 = ptrtoint i8 addrspace(3)* %src to i32 + %1 = add i32 %0, %index.05 + %2 = inttoptr i32 %1 to i8 addrspace(3)* + %3 = load i8 addrspace(3)* %2, align 1 + %4 = ptrtoint i8 addrspace(0)* %dst to i32 + %5 = add i32 %4, %index.05 + %6 = inttoptr i32 %5 to i8 addrspace(0)* + store i8 %3, i8 addrspace(0)* %6, align 1 + %inc = add i32 %index.05, 1 + %cmp = icmp ult i32 %inc, %size + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline { +entry: + %cmp4 = icmp eq i32 %size, 0 + br i1 %cmp4, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %0 = ptrtoint i8 addrspace(1)* %src to i32 + %1 = add i32 %0, %index.05 + %2 = inttoptr i32 %1 to i8 addrspace(1)* + %3 = load i8 addrspace(1)* %2, align 1 + %4 = ptrtoint i8 addrspace(3)* %dst to i32 + %5 = add i32 %4, %index.05 + %6 = inttoptr i32 %5 to i8 addrspace(3)* + store i8 %3, i8 addrspace(3)* %6, align 1 + %inc = add i32 %index.05, 1 + %cmp = icmp ult i32 %inc, %size + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline { +entry: + %cmp4 = icmp eq i32 %size, 0 + br i1 %cmp4, label %while.end, label %while.body + +while.body: ; preds = 
%entry, %while.body + %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %0 = ptrtoint i8 addrspace(0)* %src to i32 + %1 = add i32 %0, %index.05 + %2 = inttoptr i32 %1 to i8 addrspace(0)* + %3 = load i8 addrspace(0)* %2, align 1 + %4 = ptrtoint i8 addrspace(3)* %dst to i32 + %5 = add i32 %4, %index.05 + %6 = inttoptr i32 %5 to i8 addrspace(3)* + store i8 %3, i8 addrspace(3)* %6, align 1 + %inc = add i32 %index.05, 1 + %cmp = icmp ult i32 %inc, %size + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline { +entry: + %cmp4 = icmp eq i32 %size, 0 + br i1 %cmp4, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %0 = ptrtoint i8 addrspace(3)* %src to i32 + %1 = add i32 %0, %index.05 + %2 = inttoptr i32 %1 to i8 addrspace(3)* + %3 = load i8 addrspace(3)* %2, align 1 + %4 = ptrtoint i8 addrspace(3)* %dst to i32 + %5 = add i32 %4, %index.05 + %6 = inttoptr i32 %5 to i8 addrspace(3)* + store i8 %3, i8 addrspace(3)* %6, align 1 + %inc = add i32 %index.05, 1 + %cmp = icmp ult i32 %inc, %size + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} diff --git a/backend/src/libocl/src/ocl_memset.ll b/backend/src/libocl/src/ocl_memset.ll index addf9f5..665eac4 100644 --- a/backend/src/libocl/src/ocl_memset.ll +++ b/backend/src/libocl/src/ocl_memset.ll @@ -1,5 +1,5 @@ ;The memset's source code. 
-; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) { +; INLINE_OVERLOADABLE void __gen_memset_align(uchar* dst, uchar val, size_t size) { ; size_t index = 0; ; uint v = (val << 24) | (val << 16) | (val << 8) | val; ; while((index + 4) >= size) { @@ -12,7 +12,7 @@ ; } ; } -define void @__gen_memset_p(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline { +define void @__gen_memset_p_align(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline { entry: %conv = zext i8 %val to i32 %shl = shl nuw i32 %conv, 24 @@ -26,7 +26,7 @@ entry: while.cond: ; preds = %while.body, %entry %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] %add = add i32 %index.0, 4 - %cmp = icmp ult i32 %add, %size + %cmp = icmp ugt i32 %add, %size br i1 %cmp, label %while.cond10, label %while.body while.body: ; preds = %while.cond @@ -50,7 +50,7 @@ while.end14: ; preds = %while.cond10 ret void } -define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline { +define void @__gen_memset_g_align(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline { entry: %conv = zext i8 %val to i32 %shl = shl nuw i32 %conv, 24 @@ -64,7 +64,7 @@ entry: while.cond: ; preds = %while.body, %entry %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] %add = add i32 %index.0, 4 - %cmp = icmp ult i32 %add, %size + %cmp = icmp ugt i32 %add, %size br i1 %cmp, label %while.cond10, label %while.body while.body: ; preds = %while.cond @@ -88,7 +88,7 @@ while.end14: ; preds = %while.cond10 ret void } -define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline { +define void @__gen_memset_l_align(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline { entry: %conv = zext i8 %val to i32 %shl = shl nuw i32 %conv, 24 @@ -102,7 +102,7 @@ entry: while.cond: ; preds = %while.body, %entry %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] %add = add i32 %index.0, 4 - 
%cmp = icmp ult i32 %add, %size + %cmp = icmp ugt i32 %add, %size br i1 %cmp, label %while.cond10, label %while.body while.body: ; preds = %while.cond @@ -125,3 +125,69 @@ while.body13: ; preds = %while.cond10 while.end14: ; preds = %while.cond10 ret void } + +;The memset's source code. +; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) { +; size_t index = 0; +; while(index < size) { +; dst[index] = val; +; index++; +; } +; } + +define void @__gen_memset_p(i8 addrspace(0)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline { +entry: + %cmp3 = icmp eq i32 %size, 0 + br i1 %cmp3, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %0 = ptrtoint i8 addrspace(0)* %dst to i32 + %1 = add i32 %0, %index.04 + %2 = inttoptr i32 %1 to i8 addrspace(0)* + store i8 %val, i8 addrspace(0)* %2, align 1 + %inc = add i32 %index.04, 1 + %cmp = icmp ult i32 %inc, %size + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline { +entry: + %cmp3 = icmp eq i32 %size, 0 + br i1 %cmp3, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %0 = ptrtoint i8 addrspace(1)* %dst to i32 + %1 = add i32 %0, %index.04 + %2 = inttoptr i32 %1 to i8 addrspace(1)* + store i8 %val, i8 addrspace(1)* %2, align 1 + %inc = add i32 %index.04, 1 + %cmp = icmp ult i32 %inc, %size + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} + +define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline { +entry: + %cmp3 = icmp eq i32 %size, 0 + br i1 %cmp3, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %index.04 = phi i32 [ %inc, 
%while.body ], [ 0, %entry ] + %0 = ptrtoint i8 addrspace(3)* %dst to i32 + %1 = add i32 %0, %index.04 + %2 = inttoptr i32 %1 to i8 addrspace(3)* + store i8 %val, i8 addrspace(3)* %2, align 1 + %inc = add i32 %index.04, 1 + %cmp = icmp ult i32 %inc, %size + br i1 %cmp, label %while.body, label %while.end + +while.end: ; preds = %while.body, %entry + ret void +} diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp index 7ef6a8a..fa09703 100644 --- a/backend/src/llvm/llvm_bitcode_link.cpp +++ b/backend/src/llvm/llvm_bitcode_link.cpp @@ -157,6 +157,20 @@ namespace gbe builtinFuncs.push_back("__gen_memset_g"); builtinFuncs.push_back("__gen_memset_l"); + builtinFuncs.push_back("__gen_memcpy_gg_align"); + builtinFuncs.push_back("__gen_memcpy_gp_align"); + builtinFuncs.push_back("__gen_memcpy_gl_align"); + builtinFuncs.push_back("__gen_memcpy_pg_align"); + builtinFuncs.push_back("__gen_memcpy_pp_align"); + builtinFuncs.push_back("__gen_memcpy_pl_align"); + builtinFuncs.push_back("__gen_memcpy_lg_align"); + builtinFuncs.push_back("__gen_memcpy_lp_align"); + builtinFuncs.push_back("__gen_memcpy_ll_align"); + builtinFuncs.push_back("__gen_memset_p_align"); + builtinFuncs.push_back("__gen_memset_g_align"); + builtinFuncs.push_back("__gen_memset_l_align"); + + for (Module::iterator SF = mod->begin(), E = mod->end(); SF != E; ++SF) { if (SF->isDeclaration()) continue; if (!isKernelFunction(*SF)) continue; diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp index 7d04318..cfb18ab 100644 --- a/backend/src/llvm/llvm_intrinsic_lowering.cpp +++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp @@ -126,13 +126,18 @@ namespace gbe { Type *IntPtr = TD.getIntPtrType(Context); Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr, /* isSigned */ false); + Value *align = Builder.CreateIntCast(CI->getArgOperand(3), IntPtr, + /* isSigned */ false); + ConstantInt *ci = dyn_cast<ConstantInt>(align); Value 
*Ops[3]; Ops[0] = CI->getArgOperand(0); Ops[1] = CI->getArgOperand(1); Ops[2] = Size; - char name[16] = "__gen_memcpy_xx"; + char name[24] = "__gen_memcpy_xx"; name[13] = convertSpaceToName(Ops[0]); name[14] = convertSpaceToName(Ops[1]); + if(ci && ci->getZExtValue() != 0 && (ci->getZExtValue() % 4 == 0)) //alignment is a constant, non-zero multiple of 4 (0 means "no alignment guarantee", so it must use the byte-wise version) + strcat(name, "_align"); replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context)); break; } @@ -143,13 +148,18 @@ namespace gbe { Type *IntPtr = TD.getIntPtrType(Op0->getType()); Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr, /* isSigned */ false); + Value *align = Builder.CreateIntCast(CI->getArgOperand(3), IntPtr, + /* isSigned */ false); + ConstantInt *ci = dyn_cast<ConstantInt>(align); Value *Ops[3]; Ops[0] = Op0; // Extend the amount to i32. Ops[1] = val; Ops[2] = Size; - char name[16] = "__gen_memset_x"; + char name[24] = "__gen_memset_x"; name[13] = convertSpaceToName(Ops[0]); + if(ci && ci->getZExtValue() != 0 && (ci->getZExtValue() % 4 == 0)) //alignment is a constant, non-zero multiple of 4 (0 means "no alignment guarantee", so it must use the byte-wise version) + strcat(name, "_align"); replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context)); break; } -- 2.7.4