From: Benjamin Segovia Date: Tue, 6 Nov 2012 04:39:43 +0000 (-0800) Subject: Added three kernels from Inigo website. Be aware that glsl mod functions is not X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=59ceed088697a57dc086a7d268a44a546953582e;p=contrib%2Fbeignet.git Added three kernels from Inigo website. Be aware that glsl mod functions is not the same as ocl one... Damn :-) --- diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index 3965843..28e4bc6 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -492,6 +492,9 @@ namespace gbe if (dyn_cast(CPV)) CPV = extractConstantElem(CPV, index); + if (dyn_cast(CPV)) + return doIt(uint32_t(0)); + // Integers if (ConstantInt *CI = dyn_cast(CPV)) { Type* Ty = CI->getType(); @@ -517,7 +520,8 @@ namespace gbe } // Floats and doubles - switch (CPV->getType()->getTypeID()) { + const Type::TypeID typeID = CPV->getType()->getTypeID(); + switch (typeID) { case Type::FloatTyID: case Type::DoubleTyID: { @@ -535,6 +539,7 @@ namespace gbe break; default: GBE_ASSERTM(false, "Unsupported constant type"); + break; } const uint64_t imm(8); return doIt(imm); diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h index eda7247..19b45d1 100644 --- a/backend/src/ocl_stdlib.h +++ b/backend/src/ocl_stdlib.h @@ -123,12 +123,12 @@ INLINE OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); } INLINE OVERLOADABLE float native_tan(float x) { return native_sin(x) / native_cos(x); } +#define E 2.71828182845904523536f +INLINE OVERLOADABLE float native_exp(float x) { return native_powr(E, x); } +#undef E // TODO make them actually compliant precision-wise -#define cos native_cos // XXX work-around ptx profile: cos already defined -#define sin native_sin // XXX work-around ptr profile: sin already defined #define sqrt native_sqrt // XXX work-around ptr profile: sin already defined - INLINE OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); } INLINE OVERLOADABLE float fabs(float x) { return __gen_ocl_fabs(x); } INLINE OVERLOADABLE float trunc(float x) { return __gen_ocl_rndz(x); } @@ -136,22 +136,30 @@ INLINE OVERLOADABLE float round(float x) { return __gen_ocl_rnde(x); } INLINE OVERLOADABLE float floor(float x) { return __gen_ocl_rndd(x); } INLINE OVERLOADABLE float ceil(float x) { return __gen_ocl_rndu(x); } INLINE OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); } +INLINE OVERLOADABLE float exp(float x, float y) { return native_exp(x); } INLINE OVERLOADABLE float fmod(float x, float y) { return x-y*trunc(x/y); } -// Hack pow is already a builtin +// TODO use llvm intrinsics definitions +#define cos native_cos +#define sin native_sin #define pow powr PURE CONST OVERLOADABLE float mad(float a, float b, float c); -OVERLOADABLE INLINE uint select(uint src0, uint src1, uint cond) { + +INLINE OVERLOADABLE uint select(uint src0, uint src1, uint cond) { + return cond ? src1 : src0; +} +INLINE OVERLOADABLE int select(int src0, int src1, int cond) { return cond ? src1 : src0; } -OVERLOADABLE INLINE int select(int src0, int src1, int cond) { +INLINE OVERLOADABLE float select(float src0, float src1, int cond) { return cond ? src1 : src0; } + // This will be optimized out by LLVM and will output LLVM select instructions #define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \ -OVERLOADABLE INLINE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \ +INLINE OVERLOADABLE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \ TYPE4 dst; \ const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \ const TYPE x1 = src1.x; \ @@ -171,19 +179,6 @@ DECL_SELECT4(int4, int, int4, 0x80000000) DECL_SELECT4(float4, float, int4, 0x80000000) #undef DECL_SELECT4 -#if 0 -INLINE OVERLOADABLE float2 mad(float2 a, float2 b, float2 c) { - return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y)); -} -INLINE OVERLOADABLE float3 mad(float3 a, float3 b, float3 c) { - return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z)); -} -INLINE OVERLOADABLE float4 mad(float4 a, float4 b, float4 c) { - return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), - mad(a.z,b.z,c.z), mad(a.w,b.w,c.w)); -} -#endif - ///////////////////////////////////////////////////////////////////////////// // Common Functions (see 6.11.4 of OCL 1.1 spec) ///////////////////////////////////////////////////////////////////////////// @@ -203,6 +198,8 @@ DECL_MIN_MAX(unsigned short) DECL_MIN_MAX(unsigned char) #undef DECL_MIN_MAX +INLINE OVERLOADABLE float fmax(float a, float b) { return max(a,b); } +INLINE OVERLOADABLE float fmin(float a, float b) { return min(a,b); } INLINE OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;} ///////////////////////////////////////////////////////////////////////////// @@ -411,68 +408,12 @@ INLINE OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(fl INLINE OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));} ///////////////////////////////////////////////////////////////////////////// -// Extensions to manipulate the register file -///////////////////////////////////////////////////////////////////////////// - -// Direct addressing register regions -OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int); -OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int); -OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int); -OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int); -OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int); -OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int, int); -OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int, int, int); -OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int, int, int, int); - -// Gather from register file -OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int); -OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int); -OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int); -OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int); -OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int); -OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int, int); -OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int, int, int); -OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int, int, int, int); - -///////////////////////////////////////////////////////////////////////////// -// Extension to have uniform condition per hardware thread -///////////////////////////////////////////////////////////////////////////// - -OVERLOADABLE unsigned short __gen_ocl_any(unsigned short cond); -OVERLOADABLE unsigned short __gen_ocl_all(unsigned short cond); - -///////////////////////////////////////////////////////////////////////////// -// Extension to support OBlock reads / writes -///////////////////////////////////////////////////////////////////////////// - -OVERLOADABLE int __gen_ocl_obread(const __global void *address); -OVERLOADABLE int __gen_ocl_obread(const __constant void *address); -OVERLOADABLE int __gen_ocl_obread(const __local void *address); -OVERLOADABLE void __gen_ocl_obwrite(const __global void *address, int); -OVERLOADABLE void __gen_ocl_obwrite(const __local void *address, int); - -///////////////////////////////////////////////////////////////////////////// // Force the compilation to SIMD8 or SIMD16 ///////////////////////////////////////////////////////////////////////////// int __gen_ocl_force_simd8(void); int __gen_ocl_force_simd16(void); -#define DECL_VOTE(TYPE) \ -INLINE OVERLOADABLE TYPE __gen_ocl_any(TYPE cond) { \ - return (TYPE) __gen_ocl_any((unsigned short) cond); \ -} \ -INLINE OVERLOADABLE TYPE __gen_ocl_all(TYPE cond) { \ - return (TYPE) __gen_ocl_all((unsigned short) cond); \ -} -DECL_VOTE(unsigned int) -DECL_VOTE(unsigned char) -DECL_VOTE(int) -DECL_VOTE(char) -DECL_VOTE(short) -DECL_VOTE(bool) -#undef DECL_VOTE - #define NULL ((void*)0) #undef PURE #undef CONST diff --git a/backend/src/ocl_stdlib_str.cpp b/backend/src/ocl_stdlib_str.cpp index b47e7ef..d51af41 100644 --- a/backend/src/ocl_stdlib_str.cpp +++ b/backend/src/ocl_stdlib_str.cpp @@ -126,12 +126,12 @@ std::string ocl_stdlib_str = "INLINE OVERLOADABLE float native_tan(float x) {\n" " return native_sin(x) / native_cos(x);\n" "}\n" +"#define E 2.71828182845904523536f\n" +"INLINE OVERLOADABLE float native_exp(float x) { return native_powr(E, x); }\n" +"#undef E\n" "\n" "// TODO make them actually compliant precision-wise\n" -"#define cos native_cos // XXX work-around ptx profile: cos already defined\n" -"#define sin native_sin // XXX work-around ptr profile: sin already defined\n" "#define sqrt native_sqrt // XXX work-around ptr profile: sin already defined\n" -"\n" "INLINE OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }\n" "INLINE OVERLOADABLE float fabs(float x) { return __gen_ocl_fabs(x); }\n" "INLINE OVERLOADABLE float trunc(float x) { return __gen_ocl_rndz(x); }\n" @@ -139,22 +139,30 @@ std::string ocl_stdlib_str = "INLINE OVERLOADABLE float floor(float x) { return __gen_ocl_rndd(x); }\n" "INLINE OVERLOADABLE float ceil(float x) { return __gen_ocl_rndu(x); }\n" "INLINE OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }\n" +"INLINE OVERLOADABLE float exp(float x, float y) { return native_exp(x); }\n" "INLINE OVERLOADABLE float fmod(float x, float y) { return x-y*trunc(x/y); }\n" "\n" -"// Hack pow is already a builtin\n" +"// TODO use llvm intrinsics definitions\n" +"#define cos native_cos\n" +"#define sin native_sin\n" "#define pow powr\n" "\n" "PURE CONST OVERLOADABLE float mad(float a, float b, float c);\n" -"OVERLOADABLE INLINE uint select(uint src0, uint src1, uint cond) {\n" +"\n" +"INLINE OVERLOADABLE uint select(uint src0, uint src1, uint cond) {\n" +" return cond ? src1 : src0;\n" +"}\n" +"INLINE OVERLOADABLE int select(int src0, int src1, int cond) {\n" " return cond ? src1 : src0;\n" "}\n" -"OVERLOADABLE INLINE int select(int src0, int src1, int cond) {\n" +"INLINE OVERLOADABLE float select(float src0, float src1, int cond) {\n" " return cond ? src1 : src0;\n" "}\n" "\n" +"\n" "// This will be optimized out by LLVM and will output LLVM select instructions\n" "#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \\\n" -"OVERLOADABLE INLINE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \\\n" +"INLINE OVERLOADABLE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \\\n" " TYPE4 dst; \\\n" " const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \\\n" " const TYPE x1 = src1.x; \\\n" @@ -174,19 +182,6 @@ std::string ocl_stdlib_str = "DECL_SELECT4(float4, float, int4, 0x80000000)\n" "#undef DECL_SELECT4\n" "\n" -"#if 0\n" -"INLINE OVERLOADABLE float2 mad(float2 a, float2 b, float2 c) {\n" -" return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y));\n" -"}\n" -"INLINE OVERLOADABLE float3 mad(float3 a, float3 b, float3 c) {\n" -" return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z));\n" -"}\n" -"INLINE OVERLOADABLE float4 mad(float4 a, float4 b, float4 c) {\n" -" return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y),\n" -" mad(a.z,b.z,c.z), mad(a.w,b.w,c.w));\n" -"}\n" -"#endif\n" -"\n" "/////////////////////////////////////////////////////////////////////////////\n" "// Common Functions (see 6.11.4 of OCL 1.1 spec)\n" "/////////////////////////////////////////////////////////////////////////////\n" @@ -206,6 +201,8 @@ std::string ocl_stdlib_str = "DECL_MIN_MAX(unsigned char)\n" "#undef DECL_MIN_MAX\n" "\n" +"INLINE OVERLOADABLE float fmax(float a, float b) { return max(a,b); }\n" +"INLINE OVERLOADABLE float fmin(float a, float b) { return min(a,b); }\n" "INLINE OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}\n" "\n" "/////////////////////////////////////////////////////////////////////////////\n" @@ -414,68 +411,12 @@ std::string ocl_stdlib_str = "INLINE OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));}\n" "\n" "/////////////////////////////////////////////////////////////////////////////\n" -"// Extensions to manipulate the register file\n" -"/////////////////////////////////////////////////////////////////////////////\n" -"\n" -"// Direct addressing register regions\n" -"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int);\n" -"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int);\n" -"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int);\n" -"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int);\n" -"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int);\n" -"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int, int);\n" -"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int, int, int);\n" -"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int, int, int, int);\n" -"\n" -"// Gather from register file\n" -"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int);\n" -"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int);\n" -"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int);\n" -"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int);\n" -"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int);\n" -"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int, int);\n" -"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int, int, int);\n" -"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int, int, int, int);\n" -"\n" -"/////////////////////////////////////////////////////////////////////////////\n" -"// Extension to have uniform condition per hardware thread\n" -"/////////////////////////////////////////////////////////////////////////////\n" -"\n" -"OVERLOADABLE unsigned short __gen_ocl_any(unsigned short cond);\n" -"OVERLOADABLE unsigned short __gen_ocl_all(unsigned short cond);\n" -"\n" -"/////////////////////////////////////////////////////////////////////////////\n" -"// Extension to support OBlock reads / writes\n" -"/////////////////////////////////////////////////////////////////////////////\n" -"\n" -"OVERLOADABLE int __gen_ocl_obread(const __global void *address);\n" -"OVERLOADABLE int __gen_ocl_obread(const __constant void *address);\n" -"OVERLOADABLE int __gen_ocl_obread(const __local void *address);\n" -"OVERLOADABLE void __gen_ocl_obwrite(const __global void *address, int);\n" -"OVERLOADABLE void __gen_ocl_obwrite(const __local void *address, int);\n" -"\n" -"/////////////////////////////////////////////////////////////////////////////\n" "// Force the compilation to SIMD8 or SIMD16\n" "/////////////////////////////////////////////////////////////////////////////\n" "\n" "int __gen_ocl_force_simd8(void);\n" "int __gen_ocl_force_simd16(void);\n" "\n" -"#define DECL_VOTE(TYPE) \\\n" -"INLINE OVERLOADABLE TYPE __gen_ocl_any(TYPE cond) { \\\n" -" return (TYPE) __gen_ocl_any((unsigned short) cond); \\\n" -"} \\\n" -"INLINE OVERLOADABLE TYPE __gen_ocl_all(TYPE cond) { \\\n" -" return (TYPE) __gen_ocl_all((unsigned short) cond); \\\n" -"}\n" -"DECL_VOTE(unsigned int)\n" -"DECL_VOTE(unsigned char)\n" -"DECL_VOTE(int)\n" -"DECL_VOTE(char)\n" -"DECL_VOTE(short)\n" -"DECL_VOTE(bool)\n" -"#undef DECL_VOTE\n" -"\n" "#define NULL ((void*)0)\n" "#undef PURE\n" "#undef CONST\n" diff --git a/kernels/compiler_chocolux.cl b/kernels/compiler_chocolux.cl new file mode 100644 index 0000000..e3bbd64 --- /dev/null +++ b/kernels/compiler_chocolux.cl @@ -0,0 +1,65 @@ +typedef float2 vec2; +typedef float3 vec3; +typedef float4 vec4; + +#define sin native_sin +#define cos native_cos +#define tan native_tan +#define exp native_exp +#define normalize fast_normalize +#define length fast_length +#define mod fmod +#define time 10.f + +inline vec3 reflect(vec3 I, vec3 N) { + return I - 2.0f * dot(N, I) * N; +} + +inline uint pack_fp4(float4 u4) { + uint u; + u = (((uint) u4.x)) | + (((uint) u4.y) << 8) | + (((uint) u4.z) << 16); + return u; +} + +#define OUTPUT do {\ + const vec4 final = 255.f * max(min(gl_FragColor, (vec4)(1.f)), (vec4)(0.f)); \ + dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \ +} while (0) + +__kernel void compiler_chocolux(__global uint *dst, float resx, float resy, int w) +{ + vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1)); + vec3 s[4]; + s[0]=(vec3)(0); + s[3]=(vec3)(sin(time),cos(time),0); + s[1]=s[3].zxy; + s[2]=s[3].zzx; + + float t,b,c,h=0.0f; + vec3 m,n; + vec3 p=(vec3)(.2f); + vec3 d=normalize(.001f*(vec3)(gl_FragCoord,.0f)-p); + + for(int i=0;i<4;i++) + { + t=2.0f; + for(int i=0;i<4;i++) + { + b=dot(d,n=s[i]-p); + c=b*b+.2f-dot(n,n); + if(b-c0.0f) + { + m=s[i];t=b-c; + } + } + p+=t*d; + d=reflect(d,n=normalize(p-m)); + h+=pow(n.x*n.x,44.f)+n.x*n.x*.2f; + } + vec4 gl_FragColor=(vec4)(h,h*h,h*h*h*h,1.f); + OUTPUT; +} + diff --git a/kernels/compiler_chocolux_ref.bmp b/kernels/compiler_chocolux_ref.bmp new file mode 100644 index 0000000..e51a4a7 Binary files /dev/null and b/kernels/compiler_chocolux_ref.bmp differ diff --git a/kernels/compiler_menger_sponge.cl b/kernels/compiler_menger_sponge.cl new file mode 100644 index 0000000..1827f68 --- /dev/null +++ b/kernels/compiler_menger_sponge.cl @@ -0,0 +1,188 @@ +// See http://www.iquilezles.org/articles/menger/menger.htm for the +// full explanation of how this was done + +typedef float2 vec2; +typedef float3 vec3; +typedef float4 vec4; + +#define sin native_sin +#define cos native_cos +#define tan native_tan +#define exp native_exp +#define normalize fast_normalize +#define length fast_length +#define mod fmod +#define time 1.f + +// fmod is not like glsl mod! +__attribute__((always_inline, overloadable)) +float glsl_mod(float x,float y) { return x-y*floor(x/y); } +__attribute__((always_inline, overloadable)) +float2 glsl_mod(float2 a,float2 b) { return (float2)(glsl_mod(a.x,b.x), glsl_mod(a.y,b.y)); } +__attribute__((always_inline, overloadable)) +float3 glsl_mod(float3 a,float3 b) { return (float3)(glsl_mod(a.x,b.x), glsl_mod(a.y,b.y), glsl_mod(a.z,b.z)); } + +inline vec3 reflect(vec3 I, vec3 N) { + return I - 2.0f * dot(N, I) * N; +} + +inline float clamp(x,m,M) { return max(min(x,M),m); } + +inline uint pack_fp4(float4 u4) { + uint u; + u = (((uint) u4.x)) | + (((uint) u4.y) << 8) | + (((uint) u4.z) << 16); + return u; +} + +#define OUTPUT do {\ + const vec4 final = 255.f * max(min(gl_FragColor, (vec4)(1.f)), (vec4)(0.f)); \ + dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \ +} while (0) + +__attribute__((always_inline)) +float maxcomp(vec3 p) { return max(p.x,max(p.y,p.z));} + +__attribute__((always_inline)) +float sdBox(vec3 p, vec3 b) +{ + vec3 di = fabs(p) - b; + float mc = maxcomp(di); + return min(mc,length(max(di,0.0f))); +} + +__attribute__((always_inline)) +vec4 map(vec3 p) +{ + float d = sdBox(p,(vec3)(1.0f)); + float4 res = (vec4)(d,1.f,0.f,0.f); + + float s = 1.0f; + for( int m=0; m<3; m++ ) + { + vec3 a = glsl_mod(p*s, 2.0f)-1.0f; + s *= 3.0f; + float rx = fabs(1.0f - 3.0f*fabs(a.x)); + float ry = fabs(1.0f - 3.0f*fabs(a.y)); + float rz = fabs(1.0f - 3.0f*fabs(a.z)); + + float da = max(rx,ry); + float db = max(ry,rz); + float dc = max(rz,rx); + float c = (min(da,min(db,dc))-1.0f)/s; + if (c > d) + { + d = c; + res = (vec4)(d, 0.2f*da*db*dc, (1.0f+(float)(m))/4.0f, 0.0f); + } + } + return (vec4)(res.x,res.y,res.z,0.f); +} + +// GLSL ES doesn't seem to like loops with conditional break/return... +#if 1 +__attribute__((always_inline)) +vec4 intersect( vec3 ro, vec3 rd ) +{ + float t = 0.0f; + for(int i=0;i<64;i++) + { + vec4 h = map(ro + rd*t); + if( h.x<0.002f ) + return (vec4)(t,h.yzw); + t += h.x; + } + return (vec4)(-1.0f); +} +#else +__attribute__((always_inline)) +vec4 intersect( vec3 ro, vec3 rd ) +{ + float t = 0.0f; + vec4 res = (vec4)(-1.0f); + for(int i=0;i<64;i++) + { + vec4 h = map(ro + rd*t); + if (h.x<0.002f) + { + if(res.x<0.0f) res = (vec4)(t,h.yzw); + } + t += h.x; + } + return res; +} +#endif + +__attribute__((always_inline)) +vec3 calcNormal(vec3 pos) +{ + vec3 eps = (vec3)(.001f,0.0f,0.0f); + vec3 nor; + nor.x = map(pos+eps.xyy).x - map(pos-eps.xyy).x; + nor.y = map(pos+eps.yxy).x - map(pos-eps.yxy).x; + nor.z = map(pos+eps.yyx).x - map(pos-eps.yyx).x; + return normalize(nor); +} + +__kernel void compiler_menger_sponge(__global uint *dst, float resx, float resy, int w) +{ + vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1)); + vec2 p=-1.0f+2.0f*gl_FragCoord.xy/(vec2)(resx,resy); + + // light + vec3 light = normalize((vec3)(1.0f,0.8f,-0.6f)); + + float ctime = time; + // camera + vec3 ro = 1.1f*(vec3)(2.5f*cos(0.5f*ctime),1.5f*cos(ctime*.23f),2.5f*sin(0.5f*ctime)); + vec3 ww = normalize((vec3)(0.0f) - ro); + vec3 uu = normalize(cross( (vec3)(0.0f,1.0f,0.0f), ww )); + vec3 vv = normalize(cross(ww,uu)); + vec3 rd = normalize( p.x*uu + p.y*vv + 1.5f*ww ); + vec3 col = (vec3)(0.0f); + vec4 tmat = intersect(ro,rd); + +#if 0 + if( tmat.x>0.0 ) + col = (vec3)( + 0.6f+0.4f*cos(5.0f+6.2831f*tmat.z), + 0.6f+0.4f*cos(5.4f+6.2831f*tmat.z), + 0.6f+0.4f*cos(5.7f+6.2831f*tmat.z) ); + +#else + if( tmat.x>0.0f ) + { + vec3 pos = ro + tmat.x*rd; + vec3 nor = calcNormal(pos); + + float dif1 = max(0.4f + 0.6f*dot(nor,light),0.0f); + float dif2 = max(0.4f + 0.6f*dot(nor,(vec3)(-light.x,light.y,-light.z)),0.0f); + + // shadow + float ldis = 4.0f; + vec4 shadow = intersect( pos + light*ldis, -light ); + if( shadow.x>0.0f && shadow.x<(ldis-0.01f) ) dif1=0.0f; + + float ao = tmat.y; + col = 1.0f*ao*(vec3) (0.2f,0.2f,0.2f); + col += 2.0f*(0.5f+0.5f*ao)*dif1*(vec3)(1.0f,0.97f,0.85f); + col += 0.2f*(0.5f+0.5f*ao)*dif2*(vec3)(1.0f,0.97f,0.85f); + col += 1.0f*(0.5f+0.5f*ao)*(0.5f+0.5f*nor.y)*(vec3)(0.1f,0.15f,0.2f); + + // gamma lighting + col = col*0.5f+0.5f*sqrt(col)*1.2f; + + vec3 matcol = (vec3)( + 0.6f+0.4f*cos(5.0f+6.2831f*tmat.z), + 0.6f+0.4f*cos(5.4f+6.2831f*tmat.z), + 0.6f+0.4f*cos(5.7f+6.2831f*tmat.z) ); + col *= matcol; + col *= 1.5f*exp(-0.5f*tmat.x); + } +#endif + + vec4 gl_FragColor = (vec4)(col,1.0f); + OUTPUT; +} + diff --git a/kernels/compiler_menger_sponge_ref.bmp b/kernels/compiler_menger_sponge_ref.bmp new file mode 100644 index 0000000..911289f Binary files /dev/null and b/kernels/compiler_menger_sponge_ref.bmp differ diff --git a/kernels/compiler_nautilus.cl b/kernels/compiler_nautilus.cl new file mode 100644 index 0000000..90e44ef --- /dev/null +++ b/kernels/compiler_nautilus.cl @@ -0,0 +1,69 @@ +typedef float2 vec2; +typedef float3 vec3; +typedef float4 vec4; + +#define sin native_sin +#define cos native_cos +#define tan native_tan +#define exp native_exp +#define normalize fast_normalize +#define length fast_length +#define mod fmod +#define time 1.f + +inline vec3 reflect(vec3 I, vec3 N) { + return I - 2.0f * dot(N, I) * N; +} + +inline float clamp(x,m,M) { return max(min(x,M),m); } + +inline uint pack_fp4(float4 u4) { + uint u; + u = (((uint) u4.x)) | + (((uint) u4.y) << 8) | + (((uint) u4.z) << 16); + return u; +} + +#define OUTPUT do {\ + const vec4 final = 255.f * max(min(gl_FragColor, (vec4)(1.f)), (vec4)(0.f)); \ + dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \ +} while (0) + +inline float e(vec3 c) +{ + c=cos((vec3)(cos(c.x+time/6.0f)*c.x-cos(c.y*3.0f+time/5.0f)*c.y, + cos(time/4.0f)*c.z/3.0f*c.x-cos(time/7.0f)*c.y, + c.x+c.y+c.z+time)); + return dot(c*c,(vec3)(1.0f))-1.0f; +} + +__kernel void compiler_nautilus(__global uint *dst, float resx, float resy, int w) +{ + vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1)); + vec2 c=-1.0f+2.0f*gl_FragCoord.xy/(vec2)(resx,resy); + vec3 o=(vec3)(c.x,c.y,0.0f),g=(vec3)(c.x,c.y,1.0f)/64.0f,v=(vec3)(0.5f); + float m = 0.4f; + + for(int r=0;r<100;r++) + { + float h=e(o)-m; + if(h<0.0f)break; + o+=h*10.0f*g; + v+=h*0.02f; + } + // light (who needs a normal?) + v+=e(o+0.1f)*(vec3)(0.4f,0.7f,1.0f); + + // ambient occlusion + float a=0.0f; + for(int q=0;q<100;q++) + { + float l = e(o+0.5f*(vec3)(cos(1.1f*(float)(q)),cos(1.6f*(float)(q)),cos(1.4f*(float)(q))))-m; + a+=clamp(4.0f*l,0.0f,1.0f); + } + v*=a/100.0f; + vec4 gl_FragColor=(vec4)(v,1.0f); + OUTPUT; +} + diff --git a/kernels/compiler_nautilus_ref.bmp b/kernels/compiler_nautilus_ref.bmp new file mode 100644 index 0000000..9d2dd96 Binary files /dev/null and b/kernels/compiler_nautilus_ref.bmp differ diff --git a/utests/compiler_clod.cpp b/utests/compiler_clod.cpp deleted file mode 100644 index e68b49f..0000000 --- a/utests/compiler_clod.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright © 2012 Intel Corporation - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library. If not, see . - * - * Author: Benjamin Segovia - */ - -#include "compiler_shader_toy.hpp" - -static void compiler_clod(void) { run_kernel(256, 256, "compiler_clod"); } -MAKE_UTEST_FROM_FUNCTION(compiler_clod); - diff --git a/utests/compiler_ribbon.cpp b/utests/compiler_ribbon.cpp deleted file mode 100644 index 6179d1f..0000000 --- a/utests/compiler_ribbon.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright © 2012 Intel Corporation - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library. If not, see . - * - * Author: Benjamin Segovia - */ - -#include "compiler_shader_toy.hpp" - -static void compiler_ribbon(void) { run_kernel(256, 256, "compiler_ribbon"); } -MAKE_UTEST_FROM_FUNCTION(compiler_ribbon); - diff --git a/utests/compiler_shader_toy.cpp b/utests/compiler_shader_toy.cpp index c89904a..d815df2 100644 --- a/utests/compiler_shader_toy.cpp +++ b/utests/compiler_shader_toy.cpp @@ -27,9 +27,11 @@ * The code here is just to wrap the common code used by all the kernels (to run * the code and assert its correctness) */ -#include "compiler_shader_toy.hpp" +#include "utest_helper.hpp" -void run_kernel(int w, int h, const char *name) +static const int dim = 256; + +static void run_kernel(int w, int h, const char *name) { const size_t global[2] = {size_t(w), size_t(h)}; const size_t local[2] = {16, 1}; @@ -65,8 +67,11 @@ void run_kernel(int w, int h, const char *name) static void NAME(void) { run_kernel(W,H,#NAME); } \ MAKE_UTEST_FROM_FUNCTION(NAME); -DECL_SHADER_TOY_TEST(256,256,compiler_clod); -DECL_SHADER_TOY_TEST(256,256,compiler_ribbon); +DECL_SHADER_TOY_TEST(dim,dim,compiler_clod); +DECL_SHADER_TOY_TEST(dim,dim,compiler_ribbon); +DECL_SHADER_TOY_TEST(dim,dim,compiler_chocolux); +DECL_SHADER_TOY_TEST(dim,dim,compiler_nautilus); +DECL_SHADER_TOY_TEST(dim,dim,compiler_menger_sponge); #undef DECL_SHADER_TOY_TEST diff --git a/utests/compiler_shader_toy.hpp b/utests/compiler_shader_toy.hpp deleted file mode 100644 index b292713..0000000 --- a/utests/compiler_shader_toy.hpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright © 2012 Intel Corporation - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library. If not, see . - * - * Author: Benjamin Segovia - */ -#ifndef __COMPILER_SHADER_TOY_HPP__ -#define __COMPILER_SHADER_TOY_HPP__ - -#include "utest_helper.hpp" - -/*! Run and check the output */ -void run_kernel(int w, int h, const char *name); - -#endif /* __COMPILER_SHADER_TOY_HPP__ */ -