From: Benjamin Segovia <benjamin.segovia@intel.com>
Date: Tue, 6 Nov 2012 04:39:43 +0000 (-0800)
Subject: Added three kernels from Inigo website. Be aware that glsl mod functions is not
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=59ceed088697a57dc086a7d268a44a546953582e;p=contrib%2Fbeignet.git

Added three kernels from Inigo website. Be aware that glsl mod functions is not
the same as ocl one... Damn :-)
---

diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 3965843..28e4bc6 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -492,6 +492,9 @@ namespace gbe
     if (dyn_cast<ConstantVector>(CPV))
       CPV = extractConstantElem(CPV, index);
 
+    if (dyn_cast<ConstantAggregateZero>(CPV))
+      return doIt(uint32_t(0));
+
     // Integers
     if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
       Type* Ty = CI->getType();
@@ -517,7 +520,8 @@ namespace gbe
     }
 
     // Floats and doubles
-    switch (CPV->getType()->getTypeID()) {
+    const Type::TypeID typeID = CPV->getType()->getTypeID();
+    switch (typeID) {
       case Type::FloatTyID:
       case Type::DoubleTyID:
       {
@@ -535,6 +539,7 @@ namespace gbe
       break;
       default:
         GBE_ASSERTM(false, "Unsupported constant type");
+        break;
     }
     const uint64_t imm(8);
     return doIt(imm);
diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
index eda7247..19b45d1 100644
--- a/backend/src/ocl_stdlib.h
+++ b/backend/src/ocl_stdlib.h
@@ -123,12 +123,12 @@ INLINE OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
 INLINE OVERLOADABLE float native_tan(float x) {
   return native_sin(x) / native_cos(x);
 }
+#define E 2.71828182845904523536f
+INLINE OVERLOADABLE float native_exp(float x) { return native_powr(E, x); }
+#undef E
 
 // TODO make them actually compliant precision-wise
-#define cos native_cos   // XXX work-around ptx profile: cos already defined
-#define sin native_sin   // XXX work-around ptr profile: sin already defined
 #define sqrt native_sqrt // XXX work-around ptr profile: sin already defined
-
 INLINE OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
 INLINE OVERLOADABLE float fabs(float x) { return __gen_ocl_fabs(x); }
 INLINE OVERLOADABLE float trunc(float x) { return __gen_ocl_rndz(x); }
@@ -136,22 +136,30 @@ INLINE OVERLOADABLE float round(float x) { return __gen_ocl_rnde(x); }
 INLINE OVERLOADABLE float floor(float x) { return __gen_ocl_rndd(x); }
 INLINE OVERLOADABLE float ceil(float x)  { return __gen_ocl_rndu(x); }
 INLINE OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
+INLINE OVERLOADABLE float exp(float x, float y) { return native_exp(x); }
 INLINE OVERLOADABLE float fmod(float x, float y) { return x-y*trunc(x/y); }
 
-// Hack pow is already a builtin
+// TODO use llvm intrinsics definitions
+#define cos native_cos
+#define sin native_sin
 #define pow powr
 
 PURE CONST OVERLOADABLE float mad(float a, float b, float c);
-OVERLOADABLE INLINE uint select(uint src0, uint src1, uint cond) {
+
+INLINE OVERLOADABLE uint select(uint src0, uint src1, uint cond) {
+  return cond ? src1 : src0;
+}
+INLINE OVERLOADABLE int select(int src0, int src1, int cond) {
   return cond ? src1 : src0;
 }
-OVERLOADABLE INLINE int select(int src0, int src1, int cond) {
+INLINE OVERLOADABLE float select(float src0, float src1, int cond) {
   return cond ? src1 : src0;
 }
 
+
 // This will be optimized out by LLVM and will output LLVM select instructions
 #define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \
-OVERLOADABLE INLINE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
+INLINE OVERLOADABLE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
   TYPE4 dst; \
   const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \
   const TYPE x1 = src1.x; \
@@ -171,19 +179,6 @@ DECL_SELECT4(int4, int, int4, 0x80000000)
 DECL_SELECT4(float4, float, int4, 0x80000000)
 #undef DECL_SELECT4
 
-#if 0
-INLINE OVERLOADABLE float2 mad(float2 a, float2 b, float2 c) {
-  return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y));
-}
-INLINE OVERLOADABLE float3 mad(float3 a, float3 b, float3 c) {
-  return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z));
-}
-INLINE OVERLOADABLE float4 mad(float4 a, float4 b, float4 c) {
-  return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y),
-                  mad(a.z,b.z,c.z), mad(a.w,b.w,c.w));
-}
-#endif
-
 /////////////////////////////////////////////////////////////////////////////
 // Common Functions (see 6.11.4 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
@@ -203,6 +198,8 @@ DECL_MIN_MAX(unsigned short)
 DECL_MIN_MAX(unsigned char)
 #undef DECL_MIN_MAX
 
+INLINE OVERLOADABLE float fmax(float a, float b) { return max(a,b); }
+INLINE OVERLOADABLE float fmin(float a, float b) { return min(a,b); }
 INLINE OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
 
 /////////////////////////////////////////////////////////////////////////////
@@ -411,68 +408,12 @@ INLINE OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(fl
 INLINE OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));}
 
 /////////////////////////////////////////////////////////////////////////////
-// Extensions to manipulate the register file
-/////////////////////////////////////////////////////////////////////////////
-
-// Direct addressing register regions
-OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int);
-OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int);
-OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int);
-OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int);
-OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int);
-OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int, int);
-OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int, int, int);
-OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int, int, int, int);
-
-// Gather from register file
-OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int);
-OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int);
-OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int);
-OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int);
-OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int);
-OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int, int);
-OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int, int, int);
-OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int, int, int, int);
-
-/////////////////////////////////////////////////////////////////////////////
-// Extension to have uniform condition per hardware thread
-/////////////////////////////////////////////////////////////////////////////
-
-OVERLOADABLE unsigned short __gen_ocl_any(unsigned short cond);
-OVERLOADABLE unsigned short __gen_ocl_all(unsigned short cond);
-
-/////////////////////////////////////////////////////////////////////////////
-// Extension to support OBlock reads / writes
-/////////////////////////////////////////////////////////////////////////////
-
-OVERLOADABLE int  __gen_ocl_obread(const __global void *address);
-OVERLOADABLE int  __gen_ocl_obread(const __constant void *address);
-OVERLOADABLE int  __gen_ocl_obread(const __local void *address);
-OVERLOADABLE void  __gen_ocl_obwrite(const __global void *address, int);
-OVERLOADABLE void  __gen_ocl_obwrite(const __local void *address, int);
-
-/////////////////////////////////////////////////////////////////////////////
 // Force the compilation to SIMD8 or SIMD16
 /////////////////////////////////////////////////////////////////////////////
 
 int __gen_ocl_force_simd8(void);
 int __gen_ocl_force_simd16(void);
 
-#define DECL_VOTE(TYPE) \
-INLINE OVERLOADABLE TYPE __gen_ocl_any(TYPE cond) { \
-  return (TYPE) __gen_ocl_any((unsigned short) cond); \
-} \
-INLINE OVERLOADABLE TYPE __gen_ocl_all(TYPE cond) { \
-  return (TYPE) __gen_ocl_all((unsigned short) cond); \
-}
-DECL_VOTE(unsigned int)
-DECL_VOTE(unsigned char)
-DECL_VOTE(int)
-DECL_VOTE(char)
-DECL_VOTE(short)
-DECL_VOTE(bool)
-#undef DECL_VOTE
-
 #define NULL ((void*)0)
 #undef PURE
 #undef CONST
diff --git a/backend/src/ocl_stdlib_str.cpp b/backend/src/ocl_stdlib_str.cpp
index b47e7ef..d51af41 100644
--- a/backend/src/ocl_stdlib_str.cpp
+++ b/backend/src/ocl_stdlib_str.cpp
@@ -126,12 +126,12 @@ std::string ocl_stdlib_str =
 "INLINE OVERLOADABLE float native_tan(float x) {\n"
 "  return native_sin(x) / native_cos(x);\n"
 "}\n"
+"#define E 2.71828182845904523536f\n"
+"INLINE OVERLOADABLE float native_exp(float x) { return native_powr(E, x); }\n"
+"#undef E\n"
 "\n"
 "// TODO make them actually compliant precision-wise\n"
-"#define cos native_cos   // XXX work-around ptx profile: cos already defined\n"
-"#define sin native_sin   // XXX work-around ptr profile: sin already defined\n"
 "#define sqrt native_sqrt // XXX work-around ptr profile: sin already defined\n"
-"\n"
 "INLINE OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }\n"
 "INLINE OVERLOADABLE float fabs(float x) { return __gen_ocl_fabs(x); }\n"
 "INLINE OVERLOADABLE float trunc(float x) { return __gen_ocl_rndz(x); }\n"
@@ -139,22 +139,30 @@ std::string ocl_stdlib_str =
 "INLINE OVERLOADABLE float floor(float x) { return __gen_ocl_rndd(x); }\n"
 "INLINE OVERLOADABLE float ceil(float x)  { return __gen_ocl_rndu(x); }\n"
 "INLINE OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }\n"
+"INLINE OVERLOADABLE float exp(float x, float y) { return native_exp(x); }\n"
 "INLINE OVERLOADABLE float fmod(float x, float y) { return x-y*trunc(x/y); }\n"
 "\n"
-"// Hack pow is already a builtin\n"
+"// TODO use llvm intrinsics definitions\n"
+"#define cos native_cos\n"
+"#define sin native_sin\n"
 "#define pow powr\n"
 "\n"
 "PURE CONST OVERLOADABLE float mad(float a, float b, float c);\n"
-"OVERLOADABLE INLINE uint select(uint src0, uint src1, uint cond) {\n"
+"\n"
+"INLINE OVERLOADABLE uint select(uint src0, uint src1, uint cond) {\n"
+"  return cond ? src1 : src0;\n"
+"}\n"
+"INLINE OVERLOADABLE int select(int src0, int src1, int cond) {\n"
 "  return cond ? src1 : src0;\n"
 "}\n"
-"OVERLOADABLE INLINE int select(int src0, int src1, int cond) {\n"
+"INLINE OVERLOADABLE float select(float src0, float src1, int cond) {\n"
 "  return cond ? src1 : src0;\n"
 "}\n"
 "\n"
+"\n"
 "// This will be optimized out by LLVM and will output LLVM select instructions\n"
 "#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \\\n"
-"OVERLOADABLE INLINE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \\\n"
+"INLINE OVERLOADABLE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \\\n"
 "  TYPE4 dst; \\\n"
 "  const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \\\n"
 "  const TYPE x1 = src1.x; \\\n"
@@ -174,19 +182,6 @@ std::string ocl_stdlib_str =
 "DECL_SELECT4(float4, float, int4, 0x80000000)\n"
 "#undef DECL_SELECT4\n"
 "\n"
-"#if 0\n"
-"INLINE OVERLOADABLE float2 mad(float2 a, float2 b, float2 c) {\n"
-"  return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y));\n"
-"}\n"
-"INLINE OVERLOADABLE float3 mad(float3 a, float3 b, float3 c) {\n"
-"  return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z));\n"
-"}\n"
-"INLINE OVERLOADABLE float4 mad(float4 a, float4 b, float4 c) {\n"
-"  return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y),\n"
-"                  mad(a.z,b.z,c.z), mad(a.w,b.w,c.w));\n"
-"}\n"
-"#endif\n"
-"\n"
 "/////////////////////////////////////////////////////////////////////////////\n"
 "// Common Functions (see 6.11.4 of OCL 1.1 spec)\n"
 "/////////////////////////////////////////////////////////////////////////////\n"
@@ -206,6 +201,8 @@ std::string ocl_stdlib_str =
 "DECL_MIN_MAX(unsigned char)\n"
 "#undef DECL_MIN_MAX\n"
 "\n"
+"INLINE OVERLOADABLE float fmax(float a, float b) { return max(a,b); }\n"
+"INLINE OVERLOADABLE float fmin(float a, float b) { return min(a,b); }\n"
 "INLINE OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}\n"
 "\n"
 "/////////////////////////////////////////////////////////////////////////////\n"
@@ -414,68 +411,12 @@ std::string ocl_stdlib_str =
 "INLINE OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));}\n"
 "\n"
 "/////////////////////////////////////////////////////////////////////////////\n"
-"// Extensions to manipulate the register file\n"
-"/////////////////////////////////////////////////////////////////////////////\n"
-"\n"
-"// Direct addressing register regions\n"
-"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int);\n"
-"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int);\n"
-"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int);\n"
-"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int);\n"
-"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int);\n"
-"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int, int);\n"
-"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int, int, int);\n"
-"OVERLOADABLE int __gen_ocl_region(int offset, int vstride, int width, int hstride, int, int, int, int, int, int, int, int);\n"
-"\n"
-"// Gather from register file\n"
-"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int);\n"
-"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int);\n"
-"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int);\n"
-"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int);\n"
-"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int);\n"
-"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int, int);\n"
-"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int, int, int);\n"
-"OVERLOADABLE int __gen_ocl_rgather(unsigned short index, int, int, int, int, int, int, int, int);\n"
-"\n"
-"/////////////////////////////////////////////////////////////////////////////\n"
-"// Extension to have uniform condition per hardware thread\n"
-"/////////////////////////////////////////////////////////////////////////////\n"
-"\n"
-"OVERLOADABLE unsigned short __gen_ocl_any(unsigned short cond);\n"
-"OVERLOADABLE unsigned short __gen_ocl_all(unsigned short cond);\n"
-"\n"
-"/////////////////////////////////////////////////////////////////////////////\n"
-"// Extension to support OBlock reads / writes\n"
-"/////////////////////////////////////////////////////////////////////////////\n"
-"\n"
-"OVERLOADABLE int  __gen_ocl_obread(const __global void *address);\n"
-"OVERLOADABLE int  __gen_ocl_obread(const __constant void *address);\n"
-"OVERLOADABLE int  __gen_ocl_obread(const __local void *address);\n"
-"OVERLOADABLE void  __gen_ocl_obwrite(const __global void *address, int);\n"
-"OVERLOADABLE void  __gen_ocl_obwrite(const __local void *address, int);\n"
-"\n"
-"/////////////////////////////////////////////////////////////////////////////\n"
 "// Force the compilation to SIMD8 or SIMD16\n"
 "/////////////////////////////////////////////////////////////////////////////\n"
 "\n"
 "int __gen_ocl_force_simd8(void);\n"
 "int __gen_ocl_force_simd16(void);\n"
 "\n"
-"#define DECL_VOTE(TYPE) \\\n"
-"INLINE OVERLOADABLE TYPE __gen_ocl_any(TYPE cond) { \\\n"
-"  return (TYPE) __gen_ocl_any((unsigned short) cond); \\\n"
-"} \\\n"
-"INLINE OVERLOADABLE TYPE __gen_ocl_all(TYPE cond) { \\\n"
-"  return (TYPE) __gen_ocl_all((unsigned short) cond); \\\n"
-"}\n"
-"DECL_VOTE(unsigned int)\n"
-"DECL_VOTE(unsigned char)\n"
-"DECL_VOTE(int)\n"
-"DECL_VOTE(char)\n"
-"DECL_VOTE(short)\n"
-"DECL_VOTE(bool)\n"
-"#undef DECL_VOTE\n"
-"\n"
 "#define NULL ((void*)0)\n"
 "#undef PURE\n"
 "#undef CONST\n"
diff --git a/kernels/compiler_chocolux.cl b/kernels/compiler_chocolux.cl
new file mode 100644
index 0000000..e3bbd64
--- /dev/null
+++ b/kernels/compiler_chocolux.cl
@@ -0,0 +1,65 @@
+typedef float2 vec2;
+typedef float3 vec3;
+typedef float4 vec4;
+
+#define sin native_sin
+#define cos native_cos
+#define tan native_tan
+#define exp native_exp
+#define normalize fast_normalize
+#define length fast_length
+#define mod fmod
+#define time 10.f
+
+inline vec3 reflect(vec3 I, vec3 N) {
+  return I - 2.0f * dot(N, I) * N;
+}
+
+inline uint pack_fp4(float4 u4) {
+  uint u;
+  u = (((uint) u4.x)) |
+      (((uint) u4.y) << 8) |
+      (((uint) u4.z) << 16);
+  return u;
+}
+
+#define OUTPUT do {\
+  const vec4 final = 255.f * max(min(gl_FragColor, (vec4)(1.f)), (vec4)(0.f)); \
+  dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \
+} while (0)
+
+__kernel void compiler_chocolux(__global uint *dst, float resx, float resy, int w)
+{
+  vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1));
+  vec3 s[4];
+  s[0]=(vec3)(0);
+  s[3]=(vec3)(sin(time),cos(time),0);
+  s[1]=s[3].zxy;
+  s[2]=s[3].zzx;
+
+  float t,b,c,h=0.0f;
+  vec3 m,n;
+  vec3 p=(vec3)(.2f);
+  vec3 d=normalize(.001f*(vec3)(gl_FragCoord,.0f)-p);
+
+  for(int i=0;i<4;i++)
+  {
+    t=2.0f;
+    for(int i=0;i<4;i++)
+    {
+      b=dot(d,n=s[i]-p);
+      c=b*b+.2f-dot(n,n);
+      if(b-c<t)
+      if(c>0.0f)
+      {
+        m=s[i];t=b-c;
+      }
+    }
+    p+=t*d;
+    d=reflect(d,n=normalize(p-m));
+    h+=pow(n.x*n.x,44.f)+n.x*n.x*.2f;
+  }
+  vec4 gl_FragColor=(vec4)(h,h*h,h*h*h*h,1.f);
+  OUTPUT;
+}
+
diff --git a/kernels/compiler_chocolux_ref.bmp b/kernels/compiler_chocolux_ref.bmp
new file mode 100644
index 0000000..e51a4a7
Binary files /dev/null and b/kernels/compiler_chocolux_ref.bmp differ
diff --git a/kernels/compiler_menger_sponge.cl b/kernels/compiler_menger_sponge.cl
new file mode 100644
index 0000000..1827f68
--- /dev/null
+++ b/kernels/compiler_menger_sponge.cl
@@ -0,0 +1,188 @@
+// See http://www.iquilezles.org/articles/menger/menger.htm for the 
+// full explanation of how this was done
+
+typedef float2 vec2;
+typedef float3 vec3;
+typedef float4 vec4;
+
+#define sin native_sin
+#define cos native_cos
+#define tan native_tan
+#define exp native_exp
+#define normalize fast_normalize
+#define length fast_length
+#define mod fmod
+#define time 1.f
+
+// fmod is not like glsl mod!
+__attribute__((always_inline, overloadable))
+float glsl_mod(float x,float y) { return x-y*floor(x/y); }
+__attribute__((always_inline, overloadable))
+float2 glsl_mod(float2 a,float2 b) { return (float2)(glsl_mod(a.x,b.x), glsl_mod(a.y,b.y)); }
+__attribute__((always_inline, overloadable))
+float3 glsl_mod(float3 a,float3 b) { return (float3)(glsl_mod(a.x,b.x), glsl_mod(a.y,b.y), glsl_mod(a.z,b.z)); }
+
+inline vec3 reflect(vec3 I, vec3 N) {
+  return I - 2.0f * dot(N, I) * N;
+}
+
+inline float clamp(x,m,M) { return max(min(x,M),m); }
+
+inline uint pack_fp4(float4 u4) {
+  uint u;
+  u = (((uint) u4.x)) |
+      (((uint) u4.y) << 8) |
+      (((uint) u4.z) << 16);
+  return u;
+}
+
+#define OUTPUT do {\
+  const vec4 final = 255.f * max(min(gl_FragColor, (vec4)(1.f)), (vec4)(0.f)); \
+  dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \
+} while (0)
+
+__attribute__((always_inline))
+float maxcomp(vec3 p) { return max(p.x,max(p.y,p.z));}
+
+__attribute__((always_inline))
+float sdBox(vec3 p, vec3 b)
+{
+  vec3  di = fabs(p) - b;
+  float mc = maxcomp(di);
+  return min(mc,length(max(di,0.0f)));
+}
+
+__attribute__((always_inline))
+vec4 map(vec3 p)
+{
+   float d = sdBox(p,(vec3)(1.0f));
+   float4 res = (vec4)(d,1.f,0.f,0.f);
+
+   float s = 1.0f;
+   for( int m=0; m<3; m++ ) 
+   {
+      vec3 a = glsl_mod(p*s, 2.0f)-1.0f;
+      s *= 3.0f;
+      float rx = fabs(1.0f - 3.0f*fabs(a.x));
+      float ry = fabs(1.0f - 3.0f*fabs(a.y));
+      float rz = fabs(1.0f - 3.0f*fabs(a.z));
+
+      float da = max(rx,ry);
+      float db = max(ry,rz);
+      float dc = max(rz,rx);
+      float c = (min(da,min(db,dc))-1.0f)/s;
+      if (c > d)
+      {
+          d = c;
+          res = (vec4)(d, 0.2f*da*db*dc, (1.0f+(float)(m))/4.0f, 0.0f);
+      }
+   }
+   return (vec4)(res.x,res.y,res.z,0.f);
+}
+
+// GLSL ES doesn't seem to like loops with conditional break/return...
+#if 1
+__attribute__((always_inline))
+vec4 intersect( vec3 ro, vec3 rd )
+{
+    float t = 0.0f;
+    for(int i=0;i<64;i++)
+    {
+        vec4 h = map(ro + rd*t);
+        if( h.x<0.002f )
+            return (vec4)(t,h.yzw);
+        t += h.x;
+    }
+    return (vec4)(-1.0f);
+}
+#else
+__attribute__((always_inline))
+vec4 intersect( vec3 ro, vec3 rd )
+{
+    float t = 0.0f;
+    vec4 res = (vec4)(-1.0f);
+    for(int i=0;i<64;i++)
+    {
+        vec4 h = map(ro + rd*t);
+        if (h.x<0.002f)
+        {
+            if(res.x<0.0f) res = (vec4)(t,h.yzw);
+        }
+        t += h.x;
+    }
+    return res;
+}
+#endif
+
+__attribute__((always_inline))
+vec3 calcNormal(vec3 pos)
+{
+    vec3 eps = (vec3)(.001f,0.0f,0.0f);
+    vec3 nor;
+    nor.x = map(pos+eps.xyy).x - map(pos-eps.xyy).x;
+    nor.y = map(pos+eps.yxy).x - map(pos-eps.yxy).x;
+    nor.z = map(pos+eps.yyx).x - map(pos-eps.yyx).x;
+    return normalize(nor);
+}
+
+__kernel void compiler_menger_sponge(__global uint *dst, float resx, float resy, int w)
+{
+    vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1));
+    vec2 p=-1.0f+2.0f*gl_FragCoord.xy/(vec2)(resx,resy);
+
+    // light
+    vec3 light = normalize((vec3)(1.0f,0.8f,-0.6f));
+
+    float ctime = time;
+    // camera
+    vec3 ro = 1.1f*(vec3)(2.5f*cos(0.5f*ctime),1.5f*cos(ctime*.23f),2.5f*sin(0.5f*ctime));
+    vec3 ww = normalize((vec3)(0.0f) - ro);
+    vec3 uu = normalize(cross( (vec3)(0.0f,1.0f,0.0f), ww ));
+    vec3 vv = normalize(cross(ww,uu));
+    vec3 rd = normalize( p.x*uu + p.y*vv + 1.5f*ww );
+    vec3 col = (vec3)(0.0f);
+    vec4 tmat = intersect(ro,rd);
+
+#if 0
+    if( tmat.x>0.0 )
+        col = (vec3)(
+            0.6f+0.4f*cos(5.0f+6.2831f*tmat.z),
+            0.6f+0.4f*cos(5.4f+6.2831f*tmat.z),
+            0.6f+0.4f*cos(5.7f+6.2831f*tmat.z) );
+    
+#else
+    if( tmat.x>0.0f )
+    {
+        vec3 pos = ro + tmat.x*rd;
+        vec3 nor = calcNormal(pos);
+
+        float dif1 = max(0.4f + 0.6f*dot(nor,light),0.0f);
+        float dif2 = max(0.4f + 0.6f*dot(nor,(vec3)(-light.x,light.y,-light.z)),0.0f);
+
+        // shadow
+        float ldis = 4.0f;
+        vec4 shadow = intersect( pos + light*ldis, -light );
+        if( shadow.x>0.0f && shadow.x<(ldis-0.01f) ) dif1=0.0f;
+
+        float ao = tmat.y;
+        col  = 1.0f*ao*(vec3) (0.2f,0.2f,0.2f);
+        col += 2.0f*(0.5f+0.5f*ao)*dif1*(vec3)(1.0f,0.97f,0.85f);
+        col += 0.2f*(0.5f+0.5f*ao)*dif2*(vec3)(1.0f,0.97f,0.85f);
+        col += 1.0f*(0.5f+0.5f*ao)*(0.5f+0.5f*nor.y)*(vec3)(0.1f,0.15f,0.2f);
+
+        // gamma lighting
+        col = col*0.5f+0.5f*sqrt(col)*1.2f;
+
+        vec3 matcol = (vec3)(
+            0.6f+0.4f*cos(5.0f+6.2831f*tmat.z),
+            0.6f+0.4f*cos(5.4f+6.2831f*tmat.z),
+            0.6f+0.4f*cos(5.7f+6.2831f*tmat.z) );
+        col *= matcol;
+        col *= 1.5f*exp(-0.5f*tmat.x);
+    }
+#endif
+
+  vec4 gl_FragColor = (vec4)(col,1.0f);
+  OUTPUT;
+}
+
diff --git a/kernels/compiler_menger_sponge_ref.bmp b/kernels/compiler_menger_sponge_ref.bmp
new file mode 100644
index 0000000..911289f
Binary files /dev/null and b/kernels/compiler_menger_sponge_ref.bmp differ
diff --git a/kernels/compiler_nautilus.cl b/kernels/compiler_nautilus.cl
new file mode 100644
index 0000000..90e44ef
--- /dev/null
+++ b/kernels/compiler_nautilus.cl
@@ -0,0 +1,69 @@
+typedef float2 vec2;
+typedef float3 vec3;
+typedef float4 vec4;
+
+#define sin native_sin
+#define cos native_cos
+#define tan native_tan
+#define exp native_exp
+#define normalize fast_normalize
+#define length fast_length
+#define mod fmod
+#define time 1.f
+
+inline vec3 reflect(vec3 I, vec3 N) {
+  return I - 2.0f * dot(N, I) * N;
+}
+
+inline float clamp(x,m,M) { return max(min(x,M),m); }
+
+inline uint pack_fp4(float4 u4) {
+  uint u;
+  u = (((uint) u4.x)) |
+      (((uint) u4.y) << 8) |
+      (((uint) u4.z) << 16);
+  return u;
+}
+
+#define OUTPUT do {\
+  const vec4 final = 255.f * max(min(gl_FragColor, (vec4)(1.f)), (vec4)(0.f)); \
+  dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \
+} while (0)
+
+inline float e(vec3 c)
+{
+    c=cos((vec3)(cos(c.x+time/6.0f)*c.x-cos(c.y*3.0f+time/5.0f)*c.y,
+                 cos(time/4.0f)*c.z/3.0f*c.x-cos(time/7.0f)*c.y,
+                 c.x+c.y+c.z+time));
+    return dot(c*c,(vec3)(1.0f))-1.0f;
+}
+
+__kernel void compiler_nautilus(__global uint *dst, float resx, float resy, int w)
+{
+  vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1));
+  vec2 c=-1.0f+2.0f*gl_FragCoord.xy/(vec2)(resx,resy);
+  vec3 o=(vec3)(c.x,c.y,0.0f),g=(vec3)(c.x,c.y,1.0f)/64.0f,v=(vec3)(0.5f);
+  float m = 0.4f;
+
+  for(int r=0;r<100;r++)
+  {
+    float h=e(o)-m;
+    if(h<0.0f)break;
+    o+=h*10.0f*g;
+    v+=h*0.02f;
+  }
+  // light (who needs a normal?)
+  v+=e(o+0.1f)*(vec3)(0.4f,0.7f,1.0f);
+
+  // ambient occlusion
+  float a=0.0f;
+  for(int q=0;q<100;q++)
+  {
+     float l = e(o+0.5f*(vec3)(cos(1.1f*(float)(q)),cos(1.6f*(float)(q)),cos(1.4f*(float)(q))))-m;
+     a+=clamp(4.0f*l,0.0f,1.0f);
+  }
+  v*=a/100.0f;
+  vec4 gl_FragColor=(vec4)(v,1.0f);
+  OUTPUT;
+}
+
diff --git a/kernels/compiler_nautilus_ref.bmp b/kernels/compiler_nautilus_ref.bmp
new file mode 100644
index 0000000..9d2dd96
Binary files /dev/null and b/kernels/compiler_nautilus_ref.bmp differ
diff --git a/utests/compiler_clod.cpp b/utests/compiler_clod.cpp
deleted file mode 100644
index e68b49f..0000000
--- a/utests/compiler_clod.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-/* 
- * Copyright Â© 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- * Author: Benjamin Segovia <benjamin.segovia@intel.com>
- */
-
-#include "compiler_shader_toy.hpp"
-
-static void compiler_clod(void) { run_kernel(256, 256, "compiler_clod"); }
-MAKE_UTEST_FROM_FUNCTION(compiler_clod);
-
diff --git a/utests/compiler_ribbon.cpp b/utests/compiler_ribbon.cpp
deleted file mode 100644
index 6179d1f..0000000
--- a/utests/compiler_ribbon.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-/* 
- * Copyright Â© 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- * Author: Benjamin Segovia <benjamin.segovia@intel.com>
- */
-
-#include "compiler_shader_toy.hpp"
-
-static void compiler_ribbon(void) { run_kernel(256, 256, "compiler_ribbon"); }
-MAKE_UTEST_FROM_FUNCTION(compiler_ribbon);
-
diff --git a/utests/compiler_shader_toy.cpp b/utests/compiler_shader_toy.cpp
index c89904a..d815df2 100644
--- a/utests/compiler_shader_toy.cpp
+++ b/utests/compiler_shader_toy.cpp
@@ -27,9 +27,11 @@
  * The code here is just to wrap the common code used by all the kernels (to run
  * the code and assert its correctness)
  */
-#include "compiler_shader_toy.hpp"
+#include "utest_helper.hpp"
 
-void run_kernel(int w, int h, const char *name)
+static const int dim = 256;
+
+static void run_kernel(int w, int h, const char *name)
 {
   const size_t global[2] = {size_t(w), size_t(h)};
   const size_t local[2] = {16, 1};
@@ -65,8 +67,11 @@ void run_kernel(int w, int h, const char *name)
   static void NAME(void) { run_kernel(W,H,#NAME); } \
   MAKE_UTEST_FROM_FUNCTION(NAME);
 
-DECL_SHADER_TOY_TEST(256,256,compiler_clod);
-DECL_SHADER_TOY_TEST(256,256,compiler_ribbon);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_clod);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_ribbon);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_chocolux);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_nautilus);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_menger_sponge);
 
 #undef DECL_SHADER_TOY_TEST
 
diff --git a/utests/compiler_shader_toy.hpp b/utests/compiler_shader_toy.hpp
deleted file mode 100644
index b292713..0000000
--- a/utests/compiler_shader_toy.hpp
+++ /dev/null
@@ -1,28 +0,0 @@
-/* 
- * Copyright Â© 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- * Author: Benjamin Segovia <benjamin.segovia@intel.com>
- */
-#ifndef __COMPILER_SHADER_TOY_HPP__
-#define __COMPILER_SHADER_TOY_HPP__
-
-#include "utest_helper.hpp"
-
-/*! Run and check the output */
-void run_kernel(int w, int h, const char *name);
-
-#endif /* __COMPILER_SHADER_TOY_HPP__ */
-