From eeefb77c77920d66834bbced01c002604e5d4f66 Mon Sep 17 00:00:00 2001
From: Ruiling Song
Date: Wed, 19 Mar 2014 11:41:54 +0800
Subject: [PATCH] GBE: make byte/short vload/vstore process one element each
 time.

Per the OCL spec, the computed address (p+offset*n) is 8-bit aligned for
char, and 16-bit aligned for short in vloadn & vstoren. That is, we
cannot assume that vload4 with a char pointer is 4-byte aligned. The
previous implementation will make Clang generate a load or store with
alignment 4 which is in fact only alignment 1. We need to find another
way to optimize the vloadn. But before that, let's keep vloadn and
vstoren working correctly.

This could fix the regression issue caused by byte/short optimization.

Signed-off-by: Ruiling Song
Reviewed-by: Zhigang Gong
---
 backend/src/ocl_stdlib.tmpl.h | 60 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 56 insertions(+), 4 deletions(-)

diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index e3ac632..25f2ff7 100755
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -3882,10 +3882,59 @@ INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
   DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \
   DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
 
-DECL_UNTYPED_RW_ALL(char)
-DECL_UNTYPED_RW_ALL(uchar)
-DECL_UNTYPED_RW_ALL(short)
-DECL_UNTYPED_RW_ALL(ushort)
+#define DECL_BYTE_RD_SPACE(TYPE, SPACE) \
+INLINE_OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##2)(*(p+2*offset), *(p+2*offset+1)); \
+} \
+INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##3)(*(p+3*offset), *(p+3*offset+1), *(p+3*offset+2)); \
+} \
+INLINE_OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##4)(vload2(2*offset, p), vload2(2*offset, p+2)); \
+} \
+INLINE_OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##8)(vload4(2*offset, p), vload4(2*offset, p+4)); \
+} \
+INLINE_OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##16)(vload8(2*offset, p), vload8(2*offset, p+8)); \
+}
+
+#define DECL_BYTE_WR_SPACE(TYPE, SPACE) \
+INLINE_OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p) {\
+  *(p + 2 * offset) = v.s0; \
+  *(p + 2 * offset + 1) = v.s1; \
+} \
+INLINE_OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
+  *(p + 3 * offset) = v.s0; \
+  *(p + 3 * offset + 1) = v.s1; \
+  *(p + 3 * offset + 2) = v.s2; \
+} \
+INLINE_OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p) { \
+  vstore2(v.lo, 2*offset, p); \
+  vstore2(v.hi, 2*offset, p+2); \
+} \
+INLINE_OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p) { \
+  vstore4(v.lo, 2*offset, p); \
+  vstore4(v.hi, 2*offset, p+4); \
+} \
+INLINE_OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p) { \
+  vstore8(v.lo, 2*offset, p); \
+  vstore8(v.hi, 2*offset, p+8); \
+}
+
+#define DECL_BYTE_RW_ALL(TYPE) \
+  DECL_BYTE_RD_SPACE(TYPE, __global) \
+  DECL_BYTE_RD_SPACE(TYPE, __local) \
+  DECL_BYTE_RD_SPACE(TYPE, __private) \
+  DECL_BYTE_RD_SPACE(TYPE, __constant) \
+  DECL_BYTE_WR_SPACE(TYPE, __global) \
+  DECL_BYTE_WR_SPACE(TYPE, __local) \
+  DECL_BYTE_WR_SPACE(TYPE, __private)
+
+DECL_BYTE_RW_ALL(char)
+DECL_BYTE_RW_ALL(uchar)
+DECL_BYTE_RW_ALL(short)
+DECL_BYTE_RW_ALL(ushort)
 DECL_UNTYPED_RW_ALL(int)
 DECL_UNTYPED_RW_ALL(uint)
 DECL_UNTYPED_RW_ALL(long)
@@ -3900,6 +3949,9 @@ DECL_UNTYPED_RW_ALL(double)
 #undef DECL_UNTYPED_RD_SPACE_N
 #undef DECL_UNTYPED_V3_SPACE
 #undef DECL_UNTYPED_RDV3_SPACE
+#undef DECL_BYTE_RD_SPACE
+#undef DECL_BYTE_WR_SPACE
+#undef DECL_BYTE_RW_ALL
 
 PURE CONST float __gen_ocl_f16to32(short h);
 PURE CONST short __gen_ocl_f32to16(float f);
-- 
2.7.4