From c2e1de5a51d3a07cfeb1be61fad90a39f0e4dd84 Mon Sep 17 00:00:00 2001
From: Benjamin Segovia <segovia.benjamin@gmail.com>
Date: Fri, 2 Mar 2012 20:45:26 -0800
Subject: [PATCH] Made first tests pass with vectors

---
 backend/kernels/cycle.cl              |   14 +
 backend/kernels/cycle.ll              |   15 +
 backend/kernels/cycle.o               |  Bin 0 -> 404 bytes
 backend/kernels/get_global_id.cl      |   21 +-
 backend/kernels/mad.cl                |    3 +-
 backend/kernels/mad.ll                |    7 +-
 backend/kernels/mad.o                 |  Bin 884 -> 896 bytes
 backend/kernels/short.cl              |    5 +
 backend/kernels/short.ll              |   17 +
 backend/kernels/short.o               |  Bin 0 -> 504 bytes
 backend/kernels/simple_float4.cl      |    7 +
 backend/kernels/simple_float4.ll      |   21 +
 backend/kernels/simple_float4.o       |  Bin 0 -> 596 bytes
 backend/kernels/simple_float4_2.cl    |    8 +
 backend/kernels/simple_float4_2.ll    |   22 +
 backend/kernels/simple_float4_2.o     |  Bin 0 -> 600 bytes
 backend/kernels/stdlib.h              |  211 +----
 backend/src/ir/context.hpp            |   16 +-
 backend/src/ir/function.hpp           |    2 +-
 backend/src/ir/register.cpp           |    9 +
 backend/src/ir/register.hpp           |    2 +
 backend/src/llvm/llvm_gen_backend.cpp | 1370 ++++++++++-----------------------
 backend/src/llvm/llvm_passes.cpp      |    6 +-
 backend/src/utest/utest_llvm.cpp      |   10 +-
 24 files changed, 598 insertions(+), 1168 deletions(-)
 create mode 100644 backend/kernels/cycle.cl
 create mode 100644 backend/kernels/cycle.ll
 create mode 100644 backend/kernels/cycle.o
 create mode 100644 backend/kernels/short.cl
 create mode 100644 backend/kernels/short.ll
 create mode 100644 backend/kernels/short.o
 create mode 100644 backend/kernels/simple_float4.cl
 create mode 100644 backend/kernels/simple_float4.ll
 create mode 100644 backend/kernels/simple_float4.o
 create mode 100644 backend/kernels/simple_float4_2.cl
 create mode 100644 backend/kernels/simple_float4_2.ll
 create mode 100644 backend/kernels/simple_float4_2.o

diff --git a/backend/kernels/cycle.cl b/backend/kernels/cycle.cl
new file mode 100644
index 0000000..fe9135c
--- /dev/null
+++ b/backend/kernels/cycle.cl
@@ -0,0 +1,14 @@
+__kernel void cycle(global int *dst)
+{
+  int x, y;
+
+hop0:
+  x = y;
+
+hop1:
+  y = x;
+  goto hop0;
+
+  dst[0] = x;
+}
+
diff --git a/backend/kernels/cycle.ll b/backend/kernels/cycle.ll
new file mode 100644
index 0000000..6336300
--- /dev/null
+++ b/backend/kernels/cycle.ll
@@ -0,0 +1,15 @@
+; ModuleID = 'cycle.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_kernel void @cycle(i32* nocapture %dst) noreturn nounwind readnone noinline {
+entry:
+  br label %hop0
+
+hop0:                                             ; preds = %hop0, %entry
+  br label %hop0
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32*)* @cycle}
diff --git a/backend/kernels/cycle.o b/backend/kernels/cycle.o
new file mode 100644
index 0000000000000000000000000000000000000000..b7157d7a6283d68a1a8a2781a72724fe0cb5c092
GIT binary patch
literal 404
zcmZ>AK5$Qwhk+r9fq{WhfPp~>NV7L8Pjo!N;>jjpWI2hkg-4Z@LsE{VJE?)kmC0!m
zw^NFMk%$F{geRAgi;4%=1V!Z|K@B{TibX2yCr$_?tyr+a;n)X8pgEor3=Awln!AI6
z#ZiEz^FI*CpNL@SRA68LTFGFfbb>=6wW)<c&Dn#KQ_cApr?aA_$_yS!agLKS6fE2r
zo^c##2x93l6EI)}DrXSpt60EZGlSVOquK1n2DYac*gh;^FIHf$GGH&d!1gkMy<`D<
znFD(jka2<S87lXk0DDyfdzAot8AJ<E*1$#4kCA~1Xs`l5GY61zW?_<exQE$hO0#tj
zv-OUHR#Td7b~syZIBes?39>&?fq{VwNDH$Z(s4BK7HDB+WB@sgJwV{BNTRrd%sJ=i
z0|p+#Kp`-w1p1#H$TH?=kU8>DCQXFVw!_(?<#3Nv1AAo!dof5+p#lTg32XrzGd2Lj
G9|QnjXJdc>

literal 0
HcmV?d00001

diff --git a/backend/kernels/get_global_id.cl b/backend/kernels/get_global_id.cl
index 299d6c3..010beed 100644
--- a/backend/kernels/get_global_id.cl
+++ b/backend/kernels/get_global_id.cl
@@ -1,23 +1,4 @@
-__attribute__((pure,const)) unsigned int __gen_ocl_get_global_id0(void);
-__attribute__((pure,const)) unsigned int __gen_ocl_get_global_id1(void);
-__attribute__((pure,const)) unsigned int __gen_ocl_get_global_id2(void);
-__attribute__((pure,const)) unsigned int __gen_ocl_get_local_id0(void);
-__attribute__((pure,const)) unsigned int __gen_ocl_get_local_id1(void);
-__attribute__((pure,const)) unsigned int __gen_ocl_get_local_id2(void);
-
-inline unsigned get_global_id(unsigned int dim) {
-  if (dim == 0) return __gen_ocl_get_global_id0();
-  else if (dim == 1) return __gen_ocl_get_global_id1();
-  else if (dim == 2) return __gen_ocl_get_global_id2();
-  else return 0;
-}
-
-inline unsigned get_local_id(unsigned int dim) {
-  if (dim == 0) return __gen_ocl_get_local_id0();
-  else if (dim == 1) return __gen_ocl_get_local_id1();
-  else if (dim == 2) return __gen_ocl_get_local_id2();
-  else return 0;
-}
+#include "stdlib.h"
 
 __kernel void test_global_id(__global int *dst, __global int *p)
 {
diff --git a/backend/kernels/mad.cl b/backend/kernels/mad.cl
index 9b6e36a..9589ff6 100644
--- a/backend/kernels/mad.cl
+++ b/backend/kernels/mad.cl
@@ -10,7 +10,8 @@ __kernel void add(__global int *dst, unsigned int x, float z)
     y = mad(dst[i], 2, 3);
     float z = mad((float) dst[i], 2.f, 3.f);
     float4 z0 = mad((float4) dst[i], (float4)(0.f,1.f,2.f,3.f), (float4)3.f);
-    dst[i] = y + (int) z + z0.x + z0.y + z0.z;
+    float4 x0 = z0 * (float4) 2.f;
+    dst[i] = y + (int) z + x0.x + x0.y + x0.z;
   }
 }
 
diff --git a/backend/kernels/mad.ll b/backend/kernels/mad.ll
index 536fcc4..6bd19da 100644
--- a/backend/kernels/mad.ll
+++ b/backend/kernels/mad.ll
@@ -17,14 +17,15 @@ for.body:                                         ; preds = %for.body, %entry
   %1 = insertelement <4 x float> undef, float %conv, i32 0
   %splat = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
   %call8 = tail call ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %splat, <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>) nounwind readonly
+  %mul = fmul <4 x float> %call8, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
   %conv9 = fptosi float %call5 to i32
   %add = add nsw i32 %conv9, %call2
   %conv10 = sitofp i32 %add to float
-  %2 = extractelement <4 x float> %call8, i32 0
+  %2 = extractelement <4 x float> %mul, i32 0
   %add11 = fadd float %conv10, %2
-  %3 = extractelement <4 x float> %call8, i32 1
+  %3 = extractelement <4 x float> %mul, i32 1
   %add12 = fadd float %add11, %3
-  %4 = extractelement <4 x float> %call8, i32 2
+  %4 = extractelement <4 x float> %mul, i32 2
   %add13 = fadd float %add12, %4
   %conv14 = fptosi float %add13 to i32
   store i32 %conv14, i32* %arrayidx, align 4, !tbaa !1
diff --git a/backend/kernels/mad.o b/backend/kernels/mad.o
index 26aa240908f640f28c87040101b904ddb207b35a..906b7e46e832781dd9672fe58597629009b6e7e9 100644
GIT binary patch
delta 376
zcmeyu*1*o`<b2?sA`b(@-Hn{v8S5h%7#M_EoOJ|%G#?O)xG{7sU^&%rOoEZ4Nr8hc
znMHx4NkyWEp+~Smf#Z?<O34Pxj$@HCEIHZ|XIOTmMv65l1GO<6<~_>DwwcGtfj5LD
zQ$XfG1ykTeQ#T$SMYSgGz;{w+NeLW=iYeU2$3LVTS2+C5N13fnK$w}m-u0?yQIx`O
z0|6Nc9%<(feO%0j4TdcGD-9KxC7eG@&|)@jux@r}78H}>X$a-yXp_FQfT1~(O)-F(
zhn-0UXt7xk!{LDI#{xahGx?|&i1<AbKARxl`oJVKUzGLH<*#{?Ee}klwn{Wz4l&Ku
zXakD(#xn$2Oq91^P7~u0NOWar<Cz{P!u0vdE<v7V6Uh?{Jk5zBOdE8zRr2s`z2V8o
z<6Jx2O_AaBBzXr%li-#}cOd&vG)O!*+Oc4!<Algl4JJ)16^^v}G+hP;D+8k;0|PUV
Qj%HKh*v1n!S(rHy0HZK*6#xJL

delta 364
zcmZo*|H8)U<b2?sA`b(@t&N=98SBFs7#M_EoOJ|%G%pZ~xG{7sU^&%rOoEZ4Nr96s
znMDCeOY|`G2sS8iJd$52*<jgmEODA;M{1;4lQK{T1H)n7ql|2ud7KP*Ls-%UWDZm?
zIR=`#@$e|BF>wc;lQK(6;4oB7IAe7DL&9-|=5HR#Y;6L<huK`Gvu>(?$8@mr@G*fM
zi|q^=3|aJ68VE2;I6s&m#cb4|-R#gTI8~CTA&`@!P5RsdhUQ2%MF(acb|#RuVnGat
z1Fmli^f=Gtqh285_eA(?f^6#plhAw-)<>7W=1H_XFqv8_(R4Y)G*_(+DBc^-5M(h?
z-hw$zltaLkp^ay{uLx7<=PSDed74dT9%tZbjum0rptG%4<bc7IZBs$y_EeC_Peu_J
yW7iplAod<nmg$l#Gw1SnC`)iT#hg&FI135^MneV$W*{BSro^$0Cv>trb0PpIsBYZ=

diff --git a/backend/kernels/short.cl b/backend/kernels/short.cl
new file mode 100644
index 0000000..83e38a0
--- /dev/null
+++ b/backend/kernels/short.cl
@@ -0,0 +1,5 @@
+__kernel void short_write(__global short *dst, short x, short y)
+{
+  dst[0] = x + y;
+}
+
diff --git a/backend/kernels/short.ll b/backend/kernels/short.ll
new file mode 100644
index 0000000..6225107
--- /dev/null
+++ b/backend/kernels/short.ll
@@ -0,0 +1,17 @@
+; ModuleID = 'short.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_kernel void @short_write(i16* nocapture %dst, i16 %x, i16 %y) nounwind noinline {
+entry:
+  %add = add i16 %y, %x
+  store i16 %add, i16* %dst, align 2, !tbaa !1
+  ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i16*, i16, i16)* @short_write}
+!1 = metadata !{metadata !"short", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/short.o b/backend/kernels/short.o
new file mode 100644
index 0000000000000000000000000000000000000000..bee5c62ae4673612b3e9c731edf0141757755e4e
GIT binary patch
literal 504
zcmZ>AK5$Qwhk>D*fq{WhfPp~>NV7L8Pjo!N;>jjpWI2hkg-4Z@LsE{VJE?)kmC0!m
zw^NFMk%$F{geRAgi;4%=1V!Z|K@B{TibX2yCr$_?tyr+a;n)X8pgEor3=Awln!AI6
z#ZiEz^FI*C9|&ORQ~-(r#f_9sa44iUwJ@kTdvJ29IUnP6R@79P!6PZoadL)&fg8g!
z4h1j81ubSkcM0=VEMTvh!EBk)Y<6P<+tUkd9~Q6|E3j7?uoqomdzrvqvVgtJfxQaI
zxWM)dmHSSBy{dt|N`So#q6H{x;G*cq$iM`2gaSXaG>~#;VUlnXaWd4=c*ZDu&r$A0
zB1fQr>&6F_?PUe+B@ON63)+hc*sB<t3mw`ECbTQMBr+Um$`HG>Q1(Hi>?uYbw}-rL
ziM-bscrqMCoD@aeEZ#HlKYhUW_yAwt2EL~X{O<zz-Wc!)gKT6-RA68LdWk`p#gUhZ
z$Bm_hA;AK~X9Hq41BSy1jE5C`*#iX5iX^HD#Fz;r0L2*%8NkksZd2d@a>1Yy7{2U4
zmN7?z%#nvOX(Eia9nKalhkKkF*efg8i$RJC6&M(pfi&9&k9J|-{i2G(pqOD`0050(
Bfg}I`

literal 0
HcmV?d00001

diff --git a/backend/kernels/simple_float4.cl b/backend/kernels/simple_float4.cl
new file mode 100644
index 0000000..7b47a18
--- /dev/null
+++ b/backend/kernels/simple_float4.cl
@@ -0,0 +1,7 @@
+#include "stdlib.h"
+
+__kernel void simple_float4(__global float4 *dst, __global float4 *src)
+{
+  dst[get_global_id(0)] = src[get_global_id(0)];
+}
+
diff --git a/backend/kernels/simple_float4.ll b/backend/kernels/simple_float4.ll
new file mode 100644
index 0000000..0e05208
--- /dev/null
+++ b/backend/kernels/simple_float4.ll
@@ -0,0 +1,21 @@
+; ModuleID = 'simple_float4.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_kernel void @simple_float4(<4 x float>* nocapture %dst, <4 x float>* nocapture %src) nounwind noinline {
+get_global_id.exit5:
+  %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+  %arrayidx = getelementptr inbounds <4 x float>* %src, i32 %call.i
+  %0 = load <4 x float>* %arrayidx, align 16, !tbaa !1
+  %arrayidx2 = getelementptr inbounds <4 x float>* %dst, i32 %call.i
+  store <4 x float> %0, <4 x float>* %arrayidx2, align 16, !tbaa !1
+  ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x float>*, <4 x float>*)* @simple_float4}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/simple_float4.o b/backend/kernels/simple_float4.o
new file mode 100644
index 0000000000000000000000000000000000000000..62e522df83f688d228864cf4fde651c58f9f7ab4
GIT binary patch
literal 596
zcmZ>AK5$Qwhk;=d0|Nu200V;%kY;aGp6Gaj#gk3I$Z`^63y&%*hol@!cT$6ZE0fbC
zZl@LjBM}P@2~RE~7Zne#35v=`f*N=v6^m5ZPjCn(tyr+a;n)X8pgEor3=AAVn!AUA
z#ZiETlMzIA{s#j20|5~Ve?Tk-HwK1I1qKE_pgJQZ4o-#CrWOV@7Y|NOHJ4+Y&Wf5U
zGXx~XIXGu31h_GzF|-MGG%VpUa5iygU{f{N)d<wZz#z<5v4Fj12D4>Gv)PReY)>z+
zeOSO=tiWDnz+QBL?PUUc$pZE=2lgr;;{w|=RPH+g_NoT<DgpK~h!&u%fs3XeBLfrA
z;DnB*1x{cuILNUvFo*+%oLiVAoK&2QbSmyGl)a-U`{1GMi-j_G8s!co$XrmAd6LL6
zQGj8C!mQSk4EBnI_9BMnz>4O;h-Om;<|u*I0FG9{bJho(Eq5HY*w8G!#@X^jv-K2a
z>obR^0L@NRU|^62(!wkUI}R!waWR-@AP_R;BTJ)1k3|pDfdZg311}JVH82FSB%CO?
zc|gIniIb<<M6ywl;q)WrvI$Ueb_2;HU!=Q*8BX`}*cB=U0|S@Qkb!|2s5hETkz*T=
z3oxc3pb{8I>_Db5M}y3fhcam*jJ6%l7A=Q+oEq3GE7*%cstOet7&w8nSi|4_jwbt+
hb@n^@?B8g?xZ%$U2Wbao#WR})>Ni>(XaSiI0s!$qrvU%}

literal 0
HcmV?d00001

diff --git a/backend/kernels/simple_float4_2.cl b/backend/kernels/simple_float4_2.cl
new file mode 100644
index 0000000..c35d9bb
--- /dev/null
+++ b/backend/kernels/simple_float4_2.cl
@@ -0,0 +1,8 @@
+#include "stdlib.h"
+
+__kernel void simple_float4(__global float4 *dst, __global float4 *src)
+{
+  dst[get_global_id(0)] = src[get_global_id(0)] * src[get_global_id(0)];
+}
+
+
diff --git a/backend/kernels/simple_float4_2.ll b/backend/kernels/simple_float4_2.ll
new file mode 100644
index 0000000..4f5e1da
--- /dev/null
+++ b/backend/kernels/simple_float4_2.ll
@@ -0,0 +1,22 @@
+; ModuleID = 'simple_float4_2.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_kernel void @simple_float4(<4 x float>* nocapture %dst, <4 x float>* nocapture %src) nounwind noinline {
+get_global_id.exit10:
+  %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+  %arrayidx = getelementptr inbounds <4 x float>* %src, i32 %call.i
+  %0 = load <4 x float>* %arrayidx, align 16, !tbaa !1
+  %mul = fmul <4 x float> %0, %0
+  %arrayidx4 = getelementptr inbounds <4 x float>* %dst, i32 %call.i
+  store <4 x float> %mul, <4 x float>* %arrayidx4, align 16, !tbaa !1
+  ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x float>*, <4 x float>*)* @simple_float4}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/simple_float4_2.o b/backend/kernels/simple_float4_2.o
new file mode 100644
index 0000000000000000000000000000000000000000..8580e1c1ee3c477382df94627bb86834124232b7
GIT binary patch
literal 600
zcmZ>AK5$Qwhk;=-0|Nu200V;%kY;aGp6Gaj#gk3I$Z`^63y&%*hol@!cT$6ZE0fbC
zZl@LjBM}P@2~RE~7Zne#35v=`f*N=v6^m5ZPjCn(tyr+a;n)X8pgEor3=AAVn!AUA
z#ZiETlMzIA{s#j20|5~Ve?Tk-HwK1I1qKE_pgJQZ4o-#CrWOV@7Y|NOHJ4+Y&Wf5U
zGXx~XIXGu31h_GzF|-MGG%VpUa5iygU{f{N)d<wZz#z<5v4Fj12D4>Gv)PReY)>z+
zeOSO=tiWDnz+QBL?PUUc$pZE=2lgr;;{w|=RPH+g_NoT<DgpK~h!&u%fs3XeBLfrA
z;DnB*1x{cuILNUvFo*+%oLiVAoK&2QbSmyGl)a-U`{1GMi-j_G8s!co$XrmAd6LL6
zQGj8C!mQSk4EBnI_9BMnz>4O;h-Om;<|u*I0FG9{bJho(Eq5HY*w8G!#@X^jv-K2a
z>obR^0L@NRU|^5|(!wkUI}R!waWR-@AP_Reh)2?2pyMFN6AcC)b|z3bg*7k)vLu`+
zxOqUqwTY9b+2rQI0|DKF5=<L(?1~tsIQs10Xu-JQ&xr@Z;zn1uD5^M!FiIH3STHae
zGB7X$^+&TQa%|&q1;#c6R089P9mq81XplMbP$o@;(YC|cqUCUpQv-Wt1$!|_RiOd{
l11FFcYxuk0(PY1}&VINx4$=<Fif1+p)Niyn&;l|a1OUd7sI>q9

literal 0
HcmV?d00001

diff --git a/backend/kernels/stdlib.h b/backend/kernels/stdlib.h
index bab425d..ca3b2d9 100644
--- a/backend/kernels/stdlib.h
+++ b/backend/kernels/stdlib.h
@@ -17,188 +17,31 @@
  * Author: Benjamin Segovia <benjamin.segovia@intel.com>
  */
 
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+__attribute__((pure,const)) unsigned int __gen_ocl_get_global_id0(void);
+__attribute__((pure,const)) unsigned int __gen_ocl_get_global_id1(void);
+__attribute__((pure,const)) unsigned int __gen_ocl_get_global_id2(void);
+__attribute__((pure,const)) unsigned int __gen_ocl_get_local_id0(void);
+__attribute__((pure,const)) unsigned int __gen_ocl_get_local_id1(void);
+__attribute__((pure,const)) unsigned int __gen_ocl_get_local_id2(void);
+
+inline unsigned get_global_id(unsigned int dim) {
+  if (dim == 0) return __gen_ocl_get_global_id0();
+  else if (dim == 1) return __gen_ocl_get_global_id1();
+  else if (dim == 2) return __gen_ocl_get_global_id2();
+  else return 0;
+}
+
+inline unsigned get_local_id(unsigned int dim) {
+  if (dim == 0) return __gen_ocl_get_local_id0();
+  else if (dim == 1) return __gen_ocl_get_local_id1();
+  else if (dim == 2) return __gen_ocl_get_local_id2();
+  else return 0;
+}
+
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef int int2 __attribute__((ext_vector_type(2)));
+typedef int int3 __attribute__((ext_vector_type(3)));
+typedef int int4 __attribute__((ext_vector_type(4)));
 
-/* Types */
-
-/* Standard types from Clang's stddef and stdint, Copyright (C) 2008 Eli Friedman */
-typedef signed __INT64_TYPE__ int64_t;
-typedef unsigned __INT64_TYPE__ uint64_t;
-typedef signed __INT32_TYPE__ int32_t;
-typedef unsigned __INT32_TYPE__ uint32_t;
-typedef signed __INT16_TYPE__ int16_t;
-typedef unsigned __INT16_TYPE__ uint16_t;
-typedef signed __INT8_TYPE__ int8_t;
-typedef unsigned __INT8_TYPE__ uint8_t;
-
-#define __stdint_join3(a,b,c) a ## b ## c
-#define  __intn_t(n) __stdint_join3( int, n, _t)
-#define __uintn_t(n) __stdint_join3(uint, n, _t)
-
-typedef __typeof__(((int*)0)-((int*)0)) ptrdiff_t;
-typedef __typeof__(sizeof(int)) size_t;
-typedef  __intn_t(__INTPTR_WIDTH__)  intptr_t;
-typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t;
-
-/* OpenCL types */
-typedef uint8_t uchar;
-typedef uint16_t ushort;
-typedef uint32_t uint;
-typedef uint64_t ulong;
-
-typedef unsigned int sampler_t;
-typedef struct image2d *image2d_t;
-typedef struct image3d *image3d_t;
-
-/* Vectors */
-#define COAL_VECTOR(type, len)                                  \
-   typedef type type##len __attribute__((ext_vector_type(len)))
-#define COAL_VECTOR_SET(type) \
-   COAL_VECTOR(type, 2);      \
-   COAL_VECTOR(type, 3);      \
-   COAL_VECTOR(type, 4);      \
-   COAL_VECTOR(type, 8);      \
-   COAL_VECTOR(type, 16);
-
-COAL_VECTOR_SET(char)
-COAL_VECTOR_SET(uchar)
-
-COAL_VECTOR_SET(short)
-COAL_VECTOR_SET(ushort)
-
-COAL_VECTOR_SET(int)
-COAL_VECTOR_SET(uint)
-
-COAL_VECTOR_SET(long)
-COAL_VECTOR_SET(ulong)
-
-COAL_VECTOR_SET(float)
-
-#undef COAL_VECTOR_SET
-#undef COAL_VECTOR
-
-/* Address spaces */
-#define __private __attribute__((address_space(0)))
-#define __global __attribute__((address_space(1)))
-#define __local __attribute__((address_space(2)))
-#define __constant __attribute__((address_space(3)))
-
-#define global __global
-#define local __local
-#define constant __constant
-#define private __private
-
-#define __write_only
-#define __read_only const
-
-#define write_only __write_only
-#define read_only __read_only
-
-/* Defines */
-#define OVERLOAD __attribute__((overloadable))
-
-#define CLK_NORMALIZED_COORDS_FALSE 0x00000000
-#define CLK_NORMALIZED_COORDS_TRUE  0x00000001
-#define CLK_ADDRESS_NONE            0x00000000
-#define CLK_ADDRESS_MIRRORED_REPEAT 0x00000010
-#define CLK_ADDRESS_REPEAT          0x00000020
-#define CLK_ADDRESS_CLAMP_TO_EDGE   0x00000030
-#define CLK_ADDRESS_CLAMP           0x00000040
-#define CLK_FILTER_NEAREST          0x00000000
-#define CLK_FILTER_LINEAR           0x00000100
-
-#define CLK_LOCAL_MEM_FENCE         0x00000001
-#define CLK_GLOBAL_MEM_FENCE        0x00000002
-
-#define CLK_R                        0x10B0
-#define CLK_A                        0x10B1
-#define CLK_RG                       0x10B2
-#define CLK_RA                       0x10B3
-#define CLK_RGB                      0x10B4
-#define CLK_RGBA                     0x10B5
-#define CLK_BGRA                     0x10B6
-#define CLK_ARGB                     0x10B7
-#define CLK_INTENSITY                0x10B8
-#define CLK_LUMINANCE                0x10B9
-#define CLK_Rx                       0x10BA
-#define CLK_RGx                      0x10BB
-#define CLK_RGBx                     0x10BC
-
-#define CLK_SNORM_INT8               0x10D0
-#define CLK_SNORM_INT16              0x10D1
-#define CLK_UNORM_INT8               0x10D2
-#define CLK_UNORM_INT16              0x10D3
-#define CLK_UNORM_SHORT_565          0x10D4
-#define CLK_UNORM_SHORT_555          0x10D5
-#define CLK_UNORM_INT_101010         0x10D6
-#define CLK_SIGNED_INT8              0x10D7
-#define CLK_SIGNED_INT16             0x10D8
-#define CLK_SIGNED_INT32             0x10D9
-#define CLK_UNSIGNED_INT8            0x10DA
-#define CLK_UNSIGNED_INT16           0x10DB
-#define CLK_UNSIGNED_INT32           0x10DC
-#define CLK_HALF_FLOAT               0x10DD
-#define CLK_FLOAT                    0x10DE
-
-#define M_E            2.7182818284590452354   /* e */
-#define M_LOG2E        1.4426950408889634074   /* log_2 e */
-#define M_LOG10E       0.43429448190325182765  /* log_10 e */
-#define M_LN2          0.69314718055994530942  /* log_e 2 */
-#define M_LN10         2.30258509299404568402  /* log_e 10 */
-#define M_PI           3.14159265358979323846  /* pi */
-#define M_PI_2         1.57079632679489661923  /* pi/2 */
-#define M_PI_4         0.78539816339744830962  /* pi/4 */
-#define M_1_PI         0.31830988618379067154  /* 1/pi */
-#define M_2_PI         0.63661977236758134308  /* 2/pi */
-#define M_2_SQRTPI     1.12837916709551257390  /* 2/sqrt(pi) */
-#define M_SQRT2        1.41421356237309504880  /* sqrt(2) */
-#define M_SQRT1_2      0.70710678118654752440  /* 1/sqrt(2) */
-
-/* Typedefs */
-typedef unsigned int cl_mem_fence_flags;
-
-/* Management functions */
-uint get_work_dim();
-size_t get_global_size(uint dimindx);
-size_t get_global_id(uint dimindx);
-size_t get_local_size(uint dimindx);
-size_t get_local_id(uint dimindx);
-size_t get_num_groups(uint dimindx);
-size_t get_group_id(uint dimindx);
-size_t get_global_offset(uint dimindx);
-
-void barrier(cl_mem_fence_flags flags);
-
-/* Image functions */
-float4 OVERLOAD read_imagef(image2d_t image, sampler_t sampler, int2 coord);
-float4 OVERLOAD read_imagef(image3d_t image, sampler_t sampler, int4 coord);
-float4 OVERLOAD read_imagef(image2d_t image, sampler_t sampler, float2 coord);
-float4 OVERLOAD read_imagef(image3d_t image, sampler_t sampler, float4 coord);
-int4 OVERLOAD read_imagei(image2d_t image, sampler_t sampler, int2 coord);
-int4 OVERLOAD read_imagei(image3d_t image, sampler_t sampler, int4 coord);
-int4 OVERLOAD read_imagei(image2d_t image, sampler_t sampler, float2 coord);
-int4 OVERLOAD read_imagei(image3d_t image, sampler_t sampler, float4 coord);
-uint4 OVERLOAD read_imageui(image2d_t image, sampler_t sampler, int2 coord);
-uint4 OVERLOAD read_imageui(image3d_t image, sampler_t sampler, int4 coord);
-uint4 OVERLOAD read_imageui(image2d_t image, sampler_t sampler, float2 coord);
-uint4 OVERLOAD read_imageui(image3d_t image, sampler_t sampler, float4 coord);
-
-void OVERLOAD write_imagef(image2d_t image, int2 coord, float4 color);
-void OVERLOAD write_imagef(image3d_t image, int4 coord, float4 color);
-void OVERLOAD write_imagei(image2d_t image, int2 coord, int4 color);
-void OVERLOAD write_imagei(image3d_t image, int4 coord, int4 color);
-void OVERLOAD write_imageui(image2d_t image, int2 coord, uint4 color);
-void OVERLOAD write_imageui(image3d_t image, int4 coord, uint4 color);
-
-int2 OVERLOAD get_image_dim(image2d_t image);
-int4 OVERLOAD get_image_dim(image3d_t image);
-int OVERLOAD get_image_width(image2d_t image);
-int OVERLOAD get_image_width(image3d_t image);
-int OVERLOAD get_image_height(image2d_t image);
-int OVERLOAD get_image_height(image3d_t image);
-int OVERLOAD get_image_depth(image3d_t image);
-
-int OVERLOAD get_image_channel_data_type(image2d_t image);
-int OVERLOAD get_image_channel_data_type(image3d_t image);
-int OVERLOAD get_image_channel_order(image2d_t image);
-int OVERLOAD get_image_channel_order(image3d_t image);
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index 4911d55..ffa489e 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -54,6 +54,11 @@ namespace ir {
     void endFunction(void);
     /*! Create a new register with the given family for the current function */
     Register reg(RegisterData::Family family);
+    /*! Create a new immediate value */
+    template <typename T> INLINE ImmediateIndex newImmediate(T value) {
+      const Immediate imm(value);
+      return fn->newImmediate(imm);
+    }
     /*! Create a new register holding the given value. A LOADI is pushed */
     template <typename T> INLINE Register immReg(T value) {
       GBE_ASSERTM(fn != NULL, "No function currently defined");
@@ -70,6 +75,10 @@ namespace ir {
     void input(Register reg);
     /*! Append a new output register for the function */
     void output(Register reg);
+    /*! Get the immediate value */
+    INLINE Immediate getImmediate(ImmediateIndex index) const {
+      return fn->getImmediate(index);
+    }
     /*! Get the current processed function */
     Function &getFunction(void);
     /*! Get the current processed unit */
@@ -79,8 +88,13 @@ namespace ir {
       GBE_ASSERTM(fn != NULL, "No function currently defined");
       return fn->file.appendTuple(args...);
     }
+    /*! Make a tuple from an array of register */
+    INLINE Tuple arrayTuple(const Register *reg, uint32_t regNum) {
+      GBE_ASSERTM(fn != NULL, "No function currently defined");
+      return fn->file.appendArrayTuple(reg, regNum);
+    }
     /*! We just use variadic templates to forward instruction functions */
-#define DECL_INSN(NAME, FAMILY)                                       \
+#define DECL_INSN(NAME, FAMILY)                                 \
     template <typename... Args> INLINE void NAME(Args...args);
 #include "ir/instruction.hxx"
 #undef DECL_INSN
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 8c2cc82..8ef0c0d 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -97,7 +97,7 @@ namespace ir {
     /*! Get the register file */
     INLINE const RegisterFile &getRegisterFile(void) const { return file; }
     /*! Get the given value ie immediate from the function */
-    INLINE Immediate getImmediate(uint32_t ID) const {
+    INLINE Immediate getImmediate(ImmediateIndex ID) const {
       GBE_ASSERT(ID < immediateNum());
       return immediates[ID];
     }
diff --git a/backend/src/ir/register.cpp b/backend/src/ir/register.cpp
index 25e20ba..fa00f83 100644
--- a/backend/src/ir/register.cpp
+++ b/backend/src/ir/register.cpp
@@ -50,6 +50,15 @@ namespace ir {
     return out;
   }
 
+  Tuple RegisterFile::appendArrayTuple(const Register *reg, uint32_t regNum) {
+    const Tuple index = Tuple(regTuples.size());
+    for (uint32_t regID = 0; regID < regNum; ++regID) {
+      GBE_ASSERTM(reg[regID] < this->regNum(), "Out-of-bound register");
+      regTuples.push_back(reg[regID]);
+    }
+    return index;
+  }
+
 } /* namespace ir */
 } /* namespace gbe */
 
diff --git a/backend/src/ir/register.hpp b/backend/src/ir/register.hpp
index b2acfdd..94d90e3 100644
--- a/backend/src/ir/register.hpp
+++ b/backend/src/ir/register.hpp
@@ -87,6 +87,8 @@ namespace ir {
       regs.push_back(reg);
       return Register(index);
     }
+    /*! Make a tuple from an array of register */
+    Tuple appendArrayTuple(const Register *reg, uint32_t regNum);
     /*! Make a tuple and return the index to the first element of the tuple */
     template <typename First, typename... Rest>
     INLINE Tuple appendTuple(First first, Rest... rest) {
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 3cff620..84ca795 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -17,6 +17,12 @@
  * Author: Benjamin Segovia <benjamin.segovia@intel.com>
  */
 
+/**
+ * \file llvm_gen_backend.cpp
+ * \author Benjamin Segovia <benjamin.segovia@intel.com>
+ *
+ * Transform the LLVM IR code into Gen IR code
+ */
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
@@ -69,6 +75,171 @@ using namespace llvm;
 
 namespace gbe
 {
+  /*! Gen IR manipulates only scalar types */
+  static bool isScalarType(const Type *type)
+  {
+    return type->isFloatTy()   ||
+           type->isIntegerTy() ||
+           type->isDoubleTy()  ||
+           type->isPointerTy();
+  }
+
+  /*! LLVM IR Type to Gen IR type translation */
+  static ir::Type getType(const ir::Context &ctx, const Type *type)
+  {
+    GBE_ASSERT(isScalarType(type));
+    if (type->isFloatTy() == true)
+      return ir::TYPE_FLOAT;
+    if (type->isDoubleTy() == true)
+      return ir::TYPE_DOUBLE;
+    if (type->isPointerTy() == true) {
+      if (ctx.getPointerSize() == ir::POINTER_32_BITS)
+        return ir::TYPE_U32;
+      else
+        return ir::TYPE_U64;
+    }
+    GBE_ASSERT(type->isIntegerTy() == true);
+    if (type == Type::getInt1Ty(type->getContext()))
+      return ir::TYPE_BOOL;
+    if (type == Type::getInt8Ty(type->getContext()))
+      return ir::TYPE_S8;
+    if (type == Type::getInt16Ty(type->getContext()))
+      return ir::TYPE_S16;
+    if (type == Type::getInt32Ty(type->getContext()))
+      return ir::TYPE_S32;
+    if (type == Type::getInt64Ty(type->getContext()))
+      return ir::TYPE_S64;
+    GBE_ASSERT(0);
+    return ir::TYPE_S64;
+  }
+
+  /*! Type to register family translation */
+  static ir::RegisterData::Family getFamily(const ir::Context &ctx, const Type *type)
+  {
+    GBE_ASSERT(isScalarType(type) == true); 
+    if (type == Type::getInt1Ty(type->getContext()))
+      return ir::RegisterData::BOOL;
+    if (type == Type::getInt8Ty(type->getContext()))
+      return ir::RegisterData::BYTE;
+    if (type == Type::getInt16Ty(type->getContext()))
+      return ir::RegisterData::WORD;
+    if (type == Type::getInt32Ty(type->getContext()) || type->isFloatTy())
+      return ir::RegisterData::DWORD;
+    if (type == Type::getInt64Ty(type->getContext()) || type->isDoubleTy())
+      return ir::RegisterData::QWORD;
+    if (type->isPointerTy() && ctx.getPointerSize() == ir::POINTER_32_BITS)
+      return ir::RegisterData::DWORD;
+    if (type->isPointerTy() && ctx.getPointerSize() == ir::POINTER_64_BITS)
+      return ir::RegisterData::QWORD;
+    GBE_ASSERT(0);
+    return ir::RegisterData::BOOL;
+  }
+
+  /*! Handle the LLVM IR Value to Gen IR register translation. This has 2 roles:
+   *  - Split the LLVM vector into several scalar values
+   *  - Handle the transparent copies (bitcast or use of intrincics functions
+   *    like get_local_id / get_global_id
+   */
+  class RegisterTranslator
+  {
+  public:
+    RegisterTranslator(ir::Context &ctx) : ctx(ctx) {}
+
+    /*! Empty the maps */
+    void clear(void) {
+      valueMap.clear();
+      scalarMap.clear();
+    }
+    /*! Some values will not be allocated. For example, a bit-cast destination
+     *  like: %fake = bitcast %real or a vector insertion since we do not have
+     *  vectors in Gen-IR
+     */
+    void newValueProxy(Value *real,
+                       Value *fake,
+                       uint32_t realIndex = 0u,
+                       uint32_t fakeIndex = 0u) {
+      const ValueIndex key(fake, fakeIndex);
+      const ValueIndex value(real, realIndex);
+      GBE_ASSERT(valueMap.find(key) == valueMap.end()); // Do not insert twice
+      valueMap[key] = value;
+    }
+    /*! Mostly used for the preallocated registers (lids, gids) */
+    void newScalarProxy(ir::Register reg, Value *value, uint32_t index = 0u) {
+      const ValueIndex key(value, index);
+      GBE_ASSERT(scalarMap.find(key) == scalarMap.end());
+      scalarMap[key] = reg;
+    }
+    /*! Allocate a new scalar register */
+    ir::Register newScalar(Value *value, uint32_t index = 0u)
+    {
+      GBE_ASSERT(dyn_cast<Constant>(value) == NULL);
+      Type *type = value->getType();
+      auto typeID = type->getTypeID();
+      switch (typeID) {
+        case Type::IntegerTyID:
+        case Type::FloatTyID:
+        case Type::DoubleTyID:
+        case Type::PointerTyID:
+          GBE_ASSERT(index == 0);
+          return this->newScalar(value, type, index);
+          break;
+        case Type::VectorTyID:
+        {
+          auto vectorType = cast<VectorType>(type);
+          auto elementType = vectorType->getElementType();
+          auto elementTypeID = elementType->getTypeID();
+          if (elementTypeID != Type::IntegerTyID &&
+              elementTypeID != Type::FloatTyID &&
+              elementTypeID != Type::DoubleTyID)
+            GBE_ASSERTM(false, "Vectors of elements are not supported");
+            return this->newScalar(value, elementType, index);
+          break;
+        }
+        default: NOT_SUPPORTED;
+      };
+      return ir::Register();
+    }
+    /*! Get the register from the given value at given index possibly iterating
+     *  in the value map to get the final real register
+     */
+    ir::Register getScalar(Value *value, uint32_t index = 0u) {
+      auto end = valueMap.end();
+      for (;;) {
+        auto it = valueMap.find(std::make_pair(value, index));
+        if (it == end)
+          break;
+        else {
+          value = it->second.first;
+          index = it->second.second;
+        }
+      }
+      const auto key = std::make_pair(value, index);
+      GBE_ASSERT(scalarMap.find(key) != scalarMap.end());
+      return scalarMap[key];
+    }
+
+  private:
+    /*! This maps a scalar register to a Value (index is the vector index when
+     *  the value is a vector of scalars)
+     */
+    ir::Register newScalar(Value *value, Type *type, uint32_t index) {
+      const auto key = std::make_pair(value, index);
+      GBE_ASSERT(scalarMap.find(key) == scalarMap.end());
+      const ir::RegisterData::Family family = getFamily(ctx, type);
+      const ir::Register reg = ctx.reg(family);
+      scalarMap[key] = reg;
+      return reg;
+    }
+    /*! Indices will be zero for scalar values */
+    typedef std::pair<Value*, uint32_t> ValueIndex;
+    /*! Map value to ir::Register */
+    map<ValueIndex, ir::Register> scalarMap;
+    /*! Map values to values when this is only a translation (eq bitcast) */
+    map<ValueIndex, ValueIndex> valueMap;
+    /*! Actually allocates the registers */
+    ir::Context &ctx;
+  };
+
   class CBEMCAsmInfo : public MCAsmInfo {
   public:
     CBEMCAsmInfo() {
@@ -77,30 +248,36 @@ namespace gbe
     }
   };
 
-  /// GenWriter - This class is the main chunk of code that converts an LLVM
-  /// module to a C translation unit.
+  /*! Translate LLVM IR code to Gen IR code */
   class GenWriter : public FunctionPass, public InstVisitor<GenWriter>
   {
+    /*! Unit to compute */
     ir::Unit &unit;
+    /*! Helper structure to compute the unit */
     ir::Context ctx;
+    /*! Make the LLVM-to-Gen translation */
+    RegisterTranslator regTranslator;
+    /*! Map value to ir::LabelIndex */
+    map<const Value*, ir::LabelIndex> labelMap;
+    /*! We visit each function twice. Once to allocate the registers and once to
+     *  emit the Gen IR instructions 
+     */
+    enum Pass {
+      PASS_EMIT_REGISTERS = 0,
+      PASS_EMIT_INSTRUCTIONS = 1
+    } pass;
+
     std::string FDOutErr;
     tool_output_file *FDOut;
     formatted_raw_ostream Out;
-    IntrinsicLowering *IL;
     Mangler *Mang;
     LoopInfo *LI;
     const Module *TheModule;
+    const MCObjectFileInfo *MOFI;
+    const TargetData* TD;
     const MCAsmInfo* TAsm;
     const MCRegisterInfo *MRI;
-    const MCObjectFileInfo *MOFI;
     MCContext *TCtx;
-    const TargetData* TD;
-
-    /*! Map value to ir::Register*/
-    map<const Value*, ir::Register> registerMap;
-
-    /*! Map value to ir::LabelIndex */
-    map<const Value*, ir::LabelIndex> labelMap;
 
     std::map<const ConstantFP *, unsigned> FPConstantMap;
     std::set<Function*> intrinsicPrototypesAlreadyGenerated;
@@ -120,14 +297,16 @@ namespace gbe
       : FunctionPass(ID),
         unit(unit),
         ctx(unit),
+        regTranslator(ctx),
         FDOut(new llvm::tool_output_file("-", FDOutErr, 0)),
         Out(FDOut->os()),
-        IL(0), Mang(0), LI(0),
-        TheModule(0), TAsm(0), MRI(0), MOFI(0), TCtx(0), TD(0),
+        Mang(0), LI(0),
+        TheModule(0), MOFI(0), TD(0),
         OpaqueCounter(0), NextAnonValueNumber(0)
     {
       initializeLoopInfoPass(*PassRegistry::getPassRegistry());
       FPCounter = 0;
+      pass = PASS_EMIT_REGISTERS;
     }
 
     virtual const char *getPassName() const { return "Gen Back-End"; }
@@ -156,12 +335,8 @@ namespace gbe
 
     virtual bool doFinalization(Module &M) {
       // Free memory...
-      delete IL;
       delete TD;
       delete Mang;
-      delete TCtx;
-      delete TAsm;
-      delete MRI;
       delete MOFI;
       FPConstantMap.clear();
       ByValParams.clear();
@@ -203,43 +378,30 @@ namespace gbe
     void writeOperand(Value *Operand, bool Static = false);
     void writeInstComputationInline(Instruction &I);
     void writeOperandInternal(Value *Operand, bool Static = false);
-    void writeOperandWithCast(Value* Operand, unsigned Opcode);
-    void writeOperandWithCast(Value* Operand, const ICmpInst &I);
-    bool writeInstructionCast(const Instruction &I);
-
-  private :
 
     /// Prints the definition of the intrinsic function F. Supports the 
     /// intrinsics which need to be explicitly defined in the CBackend.
     void printIntrinsicDefinition(const Function &F, raw_ostream &Out);
 
-    void printModuleTypes();
     void printContainedStructs(Type *Ty, SmallPtrSet<Type *, 16> &);
     void printFloatingPointConstants(Function &F);
     void printFloatingPointConstants(const Constant *C);
-    void emitFunctionSignature(const Function *F, bool Prototype);
 
     /*! Emit the complete function code and declaration */
     void emitFunction(Function &F);
     /*! Handle input and output function parameters */
-    void emitFunctionPrototype(const Function *F);
+    void emitFunctionPrototype(Function &F);
     /*! Emit the code for a basic block */
     void emitBasicBlock(BasicBlock *BB);
 
-    /*! Get the register family from the given type */
-    INLINE ir::RegisterData::Family getArgumentFamily(const Type*) const;
-    /*! Insert a new register when this is a scalar value */
-    INLINE ir::Register newRegister(const Value *value);
+    /*! Alocate one or several registers (if vector) for the value */
+    INLINE void newRegister(Value *value);
     /*! Return a valid register from an operand (can use LOADI to make one) */
-    INLINE ir::Register getRegister(Value *value);
-    /*! Return a valid register for a constant value */
-    INLINE ir::Register getConstantRegister(Constant *CPV);
+    INLINE ir::Register getRegister(Value *value, uint32_t index = 0);
+    /*! Create a new immediate from a constant */
+    ir::ImmediateIndex newImmediate(Constant *CPV);
     /*! Insert a new label index when this is a scalar value */
     INLINE void newLabelIndex(const Value *value);
-    /*! int / float / double / bool are scalars */
-    INLINE bool isScalarType(const Type *type) const;
-    /*! Get the Gen IR type from the LLVM type */
-    INLINE ir::Type getType(const Type *type) const;
 
     void printBasicBlock(BasicBlock *BB);
 
@@ -312,16 +474,27 @@ namespace gbe
       return false;
     }
 
-    // Instruction visitation functions
-    friend class InstVisitor<GenWriter>;
+    /*! Helper function to emit loads and stores */
+    template <bool isLoad, typename T> void emitLoadOrStore(T &I);
 
     // Currently supported instructions
-    void visitBinaryOperator(Instruction &I);
-    void visitReturnInst(ReturnInst &I);
-    void visitLoadInst(LoadInst &I);
-    void visitStoreInst(StoreInst &I);
-    void visitCallInst (CallInst &I);
-    bool visitBuiltinCall(CallInst &I, Intrinsic::ID ID, bool &WroteCallee);
+#define DECL_VISIT_FN(NAME, TYPE)         \
+    void regAllocate##NAME(TYPE &I);      \
+    void emit##NAME(TYPE &I);             \
+    void visit##NAME(TYPE &I) {           \
+      if (pass == PASS_EMIT_INSTRUCTIONS) \
+        emit##NAME(I);                    \
+      else                                \
+        regAllocate##NAME(I);             \
+    }
+    DECL_VISIT_FN(BinaryOperator, Instruction);
+    DECL_VISIT_FN(CastInst, CastInst);
+    DECL_VISIT_FN(ReturnInst, ReturnInst);
+    DECL_VISIT_FN(LoadInst, LoadInst);
+    DECL_VISIT_FN(StoreInst, StoreInst);
+    DECL_VISIT_FN(CallInst, CallInst);
+
+#undef DECL_VISIT_FN
 
     // Must be implemented later
     void visitInsertElementInst(InsertElementInst &I) {NOT_SUPPORTED;}
@@ -333,7 +506,6 @@ namespace gbe
     void visitBranchInst(BranchInst &I) {NOT_SUPPORTED;}
     void visitICmpInst(ICmpInst &I) {NOT_SUPPORTED;}
     void visitFCmpInst(FCmpInst &I) {NOT_SUPPORTED;}
-    void visitCastInst (CastInst &I);
     void visitSelectInst(SelectInst &I) {NOT_SUPPORTED;}
 
     // These instructions are not supported at all
@@ -1354,122 +1526,6 @@ static std::string CBEMangle(const std::string &S) {
       Out << ')';
   }
 
-  // Some instructions need to have their result value casted back to the
-  // original types because their operands were casted to the expected type.
-  // This function takes care of detecting that case and printing the cast
-  // for the Instruction.
-  bool GenWriter::writeInstructionCast(const Instruction &I) {
-    Type *Ty = I.getOperand(0)->getType();
-    switch (I.getOpcode()) {
-    case Instruction::Add:
-    case Instruction::Sub:
-    case Instruction::Mul:
-      // We need to cast integer arithmetic so that it is always performed
-      // as unsigned, to avoid undefined behavior on overflow.
-    case Instruction::LShr:
-    case Instruction::URem:
-    case Instruction::UDiv:
-      Out << "((";
-      printSimpleType(Out, Ty, false);
-      Out << ")(";
-      return true;
-    case Instruction::AShr:
-    case Instruction::SRem:
-    case Instruction::SDiv:
-      Out << "((";
-      printSimpleType(Out, Ty, true);
-      Out << ")(";
-      return true;
-    default: break;
-    }
-    return false;
-  }
-
-  // Write the operand with a cast to another type based on the Opcode being used.
-  // This will be used in cases where an instruction has specific type
-  // requirements (usually signedness) for its operands.
-  void GenWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) {
-
-    // Extract the operand's type, we'll need it.
-    Type* OpTy = Operand->getType();
-
-    // Indicate whether to do the cast or not.
-    bool shouldCast = false;
-
-    // Indicate whether the cast should be to a signed type or not.
-    bool castIsSigned = false;
-
-    // Based on the Opcode for which this Operand is being written, determine
-    // the new type to which the operand should be casted by setting the value
-    // of OpTy. If we change OpTy, also set shouldCast to true.
-    switch (Opcode) {
-      default:
-        // for most instructions, it doesn't matter
-        break;
-      case Instruction::Add:
-      case Instruction::Sub:
-      case Instruction::Mul:
-        // We need to cast integer arithmetic so that it is always performed
-        // as unsigned, to avoid undefined behavior on overflow.
-      case Instruction::LShr:
-      case Instruction::UDiv:
-      case Instruction::URem: // Cast to unsigned first
-        shouldCast = true;
-        castIsSigned = false;
-        break;
-      case Instruction::GetElementPtr:
-      case Instruction::AShr:
-      case Instruction::SDiv:
-      case Instruction::SRem: // Cast to signed first
-        shouldCast = true;
-        castIsSigned = true;
-        break;
-    }
-
-    // Write out the casted operand if we should, otherwise just write the
-    // operand.
-    if (shouldCast) {
-      Out << "((";
-      printSimpleType(Out, OpTy, castIsSigned);
-      Out << ")";
-      writeOperand(Operand);
-      Out << ")";
-    } else
-      writeOperand(Operand);
-  }
-
-  // Write the operand with a cast to another type based on the icmp predicate
-  // being used.
-  void GenWriter::writeOperandWithCast(Value* Operand, const ICmpInst &Cmp) {
-    // This has to do a cast to ensure the operand has the right signedness.
-    // Also, if the operand is a pointer, we make sure to cast to an integer when
-    // doing the comparison both for signedness and so that the C compiler doesn't
-    // optimize things like "p < NULL" to false (p may contain an integer value
-    // f.e.).
-    bool shouldCast = Cmp.isRelational();
-
-    // Write out the casted operand if we should, otherwise just write the
-    // operand.
-    if (!shouldCast) {
-      writeOperand(Operand);
-      return;
-    }
-
-    // Should this be a signed comparison?  If so, convert to signed.
-    bool castIsSigned = Cmp.isSigned();
-
-    // If the operand was a pointer, convert to a large integer type.
-    Type* OpTy = Operand->getType();
-    if (OpTy->isPointerTy())
-      OpTy = TD->getIntPtrType(Operand->getContext());
-
-    Out << "((";
-    printSimpleType(Out, OpTy, castIsSigned);
-    Out << ")";
-    writeOperand(Operand);
-    Out << ")";
-  }
-
   enum SpecialGlobalClass {
     NotSpecial = 0,
     GlobalCtors, GlobalDtors,
@@ -1482,14 +1538,12 @@ static std::string CBEMangle(const std::string &S) {
     // Initialize
     TheModule = &M;
 
-    TD = new TargetData(&M);
-    IL = new IntrinsicLowering(*TD);
-    IL->AddPrototypes(M);
-
     TAsm = new CBEMCAsmInfo();
+    TD = new TargetData(&M);
     MRI  = new MCRegisterInfo();
     TCtx = new MCContext(*TAsm, *MRI, NULL);
     Mang = new Mangler(*TCtx, *TD);
+
     return false;
   }
 
@@ -1561,57 +1615,6 @@ static std::string CBEMangle(const std::string &S) {
     }
   }
 
-
-  /// printSymbolTable - Run through symbol table looking for type names.  If a
-  /// type name is found, emit its declaration...
-  ///
-  void GenWriter::printModuleTypes() {
-    Out << "/* Helper union for bitcasts */\n";
-    Out << "typedef union {\n";
-    Out << "  unsigned int Int32;\n";
-    Out << "  unsigned long long Int64;\n";
-    Out << "  float Float;\n";
-    Out << "  double Double;\n";
-    Out << "} llvmBitCastUnion;\n";
-
-    // Get all of the struct types used in the module.
-    std::vector<StructType*> StructTypes;
-    TheModule->findUsedStructTypes(StructTypes);
-
-    if (StructTypes.empty()) return;
-
-    Out << "/* Structure forward decls */\n";
-
-    unsigned NextTypeID = 0;
-    
-    // If any of them are missing names, add a unique ID to UnnamedStructIDs.
-    // Print out forward declarations for structure types.
-    for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
-      StructType *ST = StructTypes[i];
-
-      if (ST->isLiteral() || ST->getName().empty())
-        UnnamedStructIDs[ST] = NextTypeID++;
-
-      std::string Name = getStructName(ST);
-
-      Out << "typedef struct " << Name << ' ' << Name << ";\n";
-    }
-
-    Out << '\n';
-
-    // Keep track of which structures have been printed so far.
-    SmallPtrSet<Type *, 16> StructPrinted;
-
-    // Loop over all structures then push them into the stack so they are
-    // printed in the correct order.
-    //
-    Out << "/* Structure contents */\n";
-    for (unsigned i = 0, e = StructTypes.size(); i != e; ++i)
-      if (StructTypes[i]->isStructTy())
-        // Only print out used types!
-        printContainedStructs(StructTypes[i], StructPrinted);
-  }
-
   // Push the struct onto the stack and recursively push all structs
   // this one depends on.
   //
@@ -1638,131 +1641,90 @@ static std::string CBEMangle(const std::string &S) {
     }
   }
 
-  INLINE bool GenWriter::isScalarType(const Type *type) const
-  {
-    return type->isFloatTy() ||
-           type->isIntegerTy() ||
-           type->isDoubleTy() ||
-           type->isPointerTy();
-  }
-
-  INLINE ir::Type GenWriter::getType(const Type *type) const
-  {
-    GBE_ASSERT(this->isScalarType(type));
-    if (type->isFloatTy() == true)
-      return ir::TYPE_FLOAT;
-    if (type->isDoubleTy() == true)
-      return ir::TYPE_DOUBLE;
-    if (type->isPointerTy() == true) {
-      if (ctx.getPointerSize() == ir::POINTER_32_BITS)
-        return ir::TYPE_U32;
-      else
-        return ir::TYPE_U64;
-    }
-    GBE_ASSERT(type->isIntegerTy() == true);
-    if (type == Type::getInt1Ty(type->getContext()))
-      return ir::TYPE_BOOL;
-    if (type == Type::getInt8Ty(type->getContext()))
-      return ir::TYPE_S8;
-    if (type == Type::getInt16Ty(type->getContext()))
-      return ir::TYPE_S16;
-    if (type == Type::getInt32Ty(type->getContext()))
-      return ir::TYPE_S32;
-    if (type == Type::getInt64Ty(type->getContext()))
-      return ir::TYPE_S64;
-    GBE_ASSERT(0);
-    return ir::TYPE_S64;
-  }
-
-  INLINE ir::RegisterData::Family GenWriter::getArgumentFamily(const Type *type) const
-  {
-    GBE_ASSERT(this->isScalarType(type) == true); 
-    if (type == Type::getInt1Ty(type->getContext()))
-      return ir::RegisterData::BOOL;
-    if (type == Type::getInt8Ty(type->getContext()))
-      return ir::RegisterData::BYTE;
-    if (type == Type::getInt16Ty(type->getContext()))
-      return ir::RegisterData::WORD;
-    if (type == Type::getInt32Ty(type->getContext()) || type->isFloatTy())
-      return ir::RegisterData::DWORD;
-    if (type == Type::getInt64Ty(type->getContext()) || type->isDoubleTy())
-      return ir::RegisterData::QWORD;
-    if (type->isPointerTy() && ctx.getPointerSize() == ir::POINTER_32_BITS)
-      return ir::RegisterData::DWORD;
-    if (type->isPointerTy() && ctx.getPointerSize() == ir::POINTER_64_BITS)
-      return ir::RegisterData::QWORD;
-    GBE_ASSERT(0);
-    return ir::RegisterData::BOOL;
-  }
-
-  ir::Register GenWriter::newRegister(const Value *value) {
-    auto it = registerMap.find(value);
-    if (it == registerMap.end()) {
-      const Type *type = value->getType();
-      const ir::RegisterData::Family family = getArgumentFamily(type);
-      const ir::Register reg = ctx.reg(family);
-      registerMap[value] = reg;
-      return reg;
-    } else
-      return it->second;
-  }
-
-  ir::Register GenWriter::getConstantRegister(Constant *CPV) {
+  ir::ImmediateIndex GenWriter::newImmediate(Constant *CPV) {
     if (dyn_cast<ConstantExpr>(CPV))
       GBE_ASSERTM(false, "Unsupported constant expression");
     else if (isa<UndefValue>(CPV) && CPV->getType()->isSingleValueType())
       GBE_ASSERTM(false, "Unsupported constant expression");
+
+    // Integers
     if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
       Type* Ty = CI->getType();
       if (Ty == Type::getInt1Ty(CPV->getContext())) {
         const bool b = CI->getZExtValue();
-        return ctx.immReg(b);
+        return ctx.newImmediate(b);
       } else if (Ty == Type::getInt8Ty(CPV->getContext())) {
         const uint8_t u8 = CI->getZExtValue();
-        return ctx.immReg(u8);
+        return ctx.newImmediate(u8);
       } else if (Ty == Type::getInt16Ty(CPV->getContext())) {
         const uint16_t u16 = CI->getZExtValue();
-        return ctx.immReg(u16);
+        return ctx.newImmediate(u16);
       } else if (Ty == Type::getInt32Ty(CPV->getContext())) {
         const uint32_t u32 = CI->getZExtValue();
-        return ctx.immReg(u32);
+        return ctx.newImmediate(u32);
       } else if (Ty == Type::getInt64Ty(CPV->getContext())) {
         const uint64_t u64 = CI->getZExtValue();
-        return ctx.immReg(u64);
+        return ctx.newImmediate(u64);
       } else {
         GBE_ASSERTM(false, "Unsupported integer size");
-        return ctx.immReg(uint64_t(0));
+        return ctx.newImmediate(uint64_t(0));
       }
     }
 
+    // Floats and doubles
     switch (CPV->getType()->getTypeID()) {
-    case Type::FloatTyID:
-    case Type::DoubleTyID:
-    {
-      ConstantFP *FPC = cast<ConstantFP>(CPV);
-      if (FPC->getType() == Type::getFloatTy(CPV->getContext())) {
-        const float f32 = FPC->getValueAPF().convertToFloat();
-        return ctx.immReg(f32);
-      } else {
-        const double f64 = FPC->getValueAPF().convertToDouble();
-        return ctx.immReg(f64);
+      case Type::FloatTyID:
+      case Type::DoubleTyID:
+      {
+        ConstantFP *FPC = cast<ConstantFP>(CPV);
+        if (FPC->getType() == Type::getFloatTy(CPV->getContext())) {
+          const float f32 = FPC->getValueAPF().convertToFloat();
+          return ctx.newImmediate(f32);
+        } else {
+          const double f64 = FPC->getValueAPF().convertToDouble();
+          return ctx.newImmediate(f64);
+        }
       }
+      break;
+      default:
+        GBE_ASSERTM(false, "Unsupported constant type");
     }
-    break;
-    default:
-      GBE_ASSERTM(false, "Unsupported constant type");
-    }
-    return ctx.immReg(uint64_t(0));
+    return ctx.newImmediate(uint64_t(0));
   }
 
-  ir::Register GenWriter::getRegister(Value *value) {
+  void GenWriter::newRegister(Value *value) {
+    auto type = value->getType();
+    auto typeID = type->getTypeID();
+    switch (typeID) {
+      case Type::IntegerTyID:
+      case Type::FloatTyID:
+      case Type::DoubleTyID:
+      case Type::PointerTyID:
+        regTranslator.newScalar(value);
+        break;
+      case Type::VectorTyID:
+      {
+        auto vectorType = cast<VectorType>(type);
+        const uint32_t elemNum = vectorType->getNumElements();
+        for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+          regTranslator.newScalar(value, elemID);
+        break;
+      }
+      default: NOT_SUPPORTED;
+    };
+  }
+
+  ir::Register GenWriter::getRegister(Value *value, uint32_t index) {
     Constant *CPV = dyn_cast<Constant>(value);
-    if (CPV && !isa<GlobalValue>(CPV))
-      return getConstantRegister(CPV);
-    else {
-      GBE_ASSERT(this->registerMap.find(value) != this->registerMap.end());
-      return this->registerMap[value];
+    if (CPV && !isa<GlobalValue>(CPV)) {
+      const ir::ImmediateIndex index = this->newImmediate(CPV);
+      const ir::Immediate imm = ctx.getImmediate(index);
+      const ir::Register reg = ctx.reg(getFamily(imm.type));
+      ctx.LOADI(imm.type, reg, index);
+      return reg;
     }
+    else
+      return regTranslator.getScalar(value, index);
   }
 
   void GenWriter::newLabelIndex(const Value *value) {
@@ -1787,23 +1749,23 @@ static std::string CBEMangle(const std::string &S) {
     }
   }
 
-  void GenWriter::emitFunctionPrototype(const Function *F)
+  void GenWriter::emitFunctionPrototype(Function &F)
   {
-    const bool returnStruct = F->hasStructRetAttr();
+    const bool returnStruct = F.hasStructRetAttr();
 
     // Loop over the arguments and output registers for them
-    if (!F->arg_empty()) {
-      Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+    if (!F.arg_empty()) {
+      Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
 
       // When a struct is returned, first argument is pointer to the structure
-      if (returnStruct) {
-        ir::Function &fn = ctx.getFunction();
-        fn.setStructReturned(true);
-      }
+      if (returnStruct)
+        ctx.getFunction().setStructReturned(true);
 
-      // Insert a new register if we need to
+      // Insert a new register for each function argument
       for (; I != E; ++I) {
-        const ir::Register reg = this->newRegister(I);
+        const Type *type = I->getType();
+        GBE_ASSERT(isScalarType(type) == true);
+        const ir::Register reg = regTranslator.newScalar(I);
         ctx.input(reg);
       }
     }
@@ -1811,9 +1773,9 @@ static std::string CBEMangle(const std::string &S) {
     // When returning a structure, first input register is the pointer to the
     // structure
     if (!returnStruct) {
-      const Type *type = F->getReturnType();
+      const Type *type = F.getReturnType();
       if (type->isVoidTy() == false) {
-        const ir::RegisterData::Family family = getArgumentFamily(type);
+        const ir::RegisterData::Family family = getFamily(ctx, type);
         const ir::Register reg = ctx.reg(family);
         ctx.output(reg);
       }
@@ -1821,120 +1783,11 @@ static std::string CBEMangle(const std::string &S) {
 
 #if GBE_DEBUG
     // Variable number of arguments is not supported
-    FunctionType *FT = cast<FunctionType>(F->getFunctionType());
+    FunctionType *FT = cast<FunctionType>(F.getFunctionType());
     GBE_ASSERT(FT->isVarArg() == false);
 #endif /* GBE_DEBUG */
   }
 
-  void GenWriter::emitFunctionSignature(const Function *F, bool Prototype)
-  {
-    /// isStructReturn - Should this function actually return a struct by-value?
-    bool isStructReturn = F->hasStructRetAttr();
-
-    // Loop over the arguments, printing them...
-    FunctionType *FT = cast<FunctionType>(F->getFunctionType());
-    const AttrListPtr &PAL = F->getAttributes();
-
-    std::string tstr;
-    raw_string_ostream FunctionInnards(tstr);
-
-    // Print out the name...
-    FunctionInnards << GetValueName(F) << '(';
-
-    bool PrintedArg = false;
-    if (!F->isDeclaration()) {
-      if (!F->arg_empty()) {
-        Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
-        unsigned Idx = 1;
-
-        // If this is a struct-return function, don't print the hidden
-        // struct-return argument.
-        if (isStructReturn) {
-          assert(I != E && "Invalid struct return function!");
-          ++I;
-          ++Idx;
-        }
-
-        std::string ArgName;
-        for (; I != E; ++I) {
-          if (PrintedArg) FunctionInnards << ", ";
-          if (I->hasName() || !Prototype) {
-            ArgName = GetValueName(I);
-          } else {
-            GBE_ASSERT(0);
-            ArgName = "";
-          }
-          Type *ArgTy = I->getType();
-          if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
-            ArgTy = cast<PointerType>(ArgTy)->getElementType();
-            ByValParams.insert(I);
-          }
-          printType(FunctionInnards, ArgTy,
-              /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt),
-              ArgName);
-          PrintedArg = true;
-          ++Idx;
-        }
-      }
-    } else {
-      GBE_ASSERT(0);
-
-      // Loop over the arguments, printing them.
-      FunctionType::param_iterator I = FT->param_begin(), E = FT->param_end();
-      unsigned Idx = 1;
-
-      // If this is a struct-return function, don't print the hidden
-      // struct-return argument.
-      if (isStructReturn) {
-        assert(I != E && "Invalid struct return function!");
-        ++I;
-        ++Idx;
-      }
-
-      for (; I != E; ++I) {
-        if (PrintedArg) FunctionInnards << ", ";
-        Type *ArgTy = *I;
-        if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
-          assert(ArgTy->isPointerTy());
-          ArgTy = cast<PointerType>(ArgTy)->getElementType();
-        }
-        printType(FunctionInnards, ArgTy,
-               /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt));
-        PrintedArg = true;
-        ++Idx;
-      }
-    }
-
-    if (!PrintedArg && FT->isVarArg()) {
-      FunctionInnards << "int vararg_dummy_arg";
-      PrintedArg = true;
-    }
-
-    // Finish printing arguments... if this is a vararg function, print the ...,
-    // unless there are no known types, in which case, we just emit ().
-    //
-    if (FT->isVarArg() && PrintedArg) {
-      FunctionInnards << ",...";  // Output varargs portion of signature!
-    } else if (!FT->isVarArg() && !PrintedArg) {
-      FunctionInnards << "void"; // ret() -> ret(void) in C.
-    }
-    FunctionInnards << ')';
-
-    // Get the return tpe for the function.
-    Type *RetTy;
-    if (!isStructReturn)
-      RetTy = F->getReturnType();
-    else {
-      // If this is a struct-return function, print the struct-return type.
-      RetTy = cast<PointerType>(FT->getParamType(0))->getElementType();
-    }
-
-    // Print out the return type and the signature built above.
-    printType(Out, RetTy,
-              /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt),
-              FunctionInnards.str());
-  }
-
   static inline bool isFPIntBitCast(const Instruction &I) {
     if (!isa<BitCastInst>(I))
       return false;
@@ -1947,20 +1800,22 @@ static std::string CBEMangle(const std::string &S) {
   void GenWriter::emitFunction(Function &F)
   {
     ctx.startFunction(GetValueName(&F));
-    this->registerMap.clear();
+    this->regTranslator.clear();
     this->labelMap.clear();
-    this->emitFunctionPrototype(&F);
+    this->emitFunctionPrototype(F);
 
-    // We create all the register variables
+    // Visit all the instructions and emit the IR registers or the value to
+    // value mapping
+    pass = PASS_EMIT_REGISTERS;
     for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I)
-      if (I->getType() != Type::getVoidTy(F.getContext()))
-        this->newRegister(&*I);
+      visit(*I);
 
     // First create all the labels (one per block)
     for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
       this->newLabelIndex(BB);
 
-    // ... then, emit the code for all basic blocks
+    // ... then, emit the instructions for all basic blocks
+    pass = PASS_EMIT_INSTRUCTIONS;
     for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
       emitBasicBlock(BB);
     ctx.endFunction();
@@ -2000,12 +1855,9 @@ static std::string CBEMangle(const std::string &S) {
     visit(*BB->getTerminator());
   }
 
+  void GenWriter::regAllocateReturnInst(ReturnInst &I) {}
 
-  // Specific Instruction type classes... note that all of the casts are
-  // necessary because we use the instruction classes as opaque types...
-  //
-  void GenWriter::visitReturnInst(ReturnInst &I) {
-    // If this is a struct return function, return the temporary struct.
+  void GenWriter::emitReturnInst(ReturnInst &I) {
     const ir::Function &fn = ctx.getFunction();
     GBE_ASSERTM(fn.outputNum() <= 1, "no more than one value can be returned");
     if (fn.outputNum() == 1 && I.getNumOperands() > 0) {
@@ -2015,30 +1867,8 @@ static std::string CBEMangle(const std::string &S) {
       ctx.MOV(ir::getType(family), dst, src);
     }
     ctx.RET();
-
-    bool isStructReturn = I.getParent()->getParent()->hasStructRetAttr();
-    if (isStructReturn) {
-      Out << "  return StructReturn;\n";
-      return;
-    }
-
-    // Don't output a void return if this is the last basic block in the function
-    if (I.getNumOperands() == 0 &&
-        &*--I.getParent()->getParent()->end() == I.getParent() &&
-        !I.getParent()->size() == 1) {
-      return;
-    }
-#if 0
-    Out << "  return";
-    if (I.getNumOperands()) {
-      Out << ' ';
-      writeOperand(I.getOperand(0));
-    }
-    Out << ";\n";
-#endif
   }
 
-
   bool GenWriter::isGotoCodeNecessary(BasicBlock *From, BasicBlock *To) {
     /// FIXME: This should be reenabled, but loop reordering safe!!
     return true;
@@ -2052,195 +1882,59 @@ static std::string CBEMangle(const std::string &S) {
       return true;
     return false;
   }
-#if 0
-  void GenWriter::printPHICopiesForSuccessor (BasicBlock *CurBlock,
-                                            BasicBlock *Successor,
-                                            unsigned Indent) {
-    for (BasicBlock::iterator I = Successor->begin(); isa<PHINode>(I); ++I) {
-      PHINode *PN = cast<PHINode>(I);
-      // Now we have to do the printing.
-      Value *IV = PN->getIncomingValueForBlock(CurBlock);
-      if (!isa<UndefValue>(IV)) {
-        Out << std::string(Indent, ' ');
-        Out << "  " << GetValueName(I) << "__PHI_TEMPORARY = ";
-        writeOperand(IV);
-        Out << ";   /* for PHI node */\n";
-      }
-    }
-  }
 
-  void GenWriter::printBranchToBlock(BasicBlock *CurBB, BasicBlock *Succ,
-                                   unsigned Indent) {
-    if (isGotoCodeNecessary(CurBB, Succ)) {
-      Out << std::string(Indent, ' ') << "  goto ";
-      writeOperand(Succ);
-      Out << ";\n";
-    }
-  }
-
-  // Branch instruction printing - Avoid printing out a branch to a basic block
-  // that immediately succeeds the current one.
-  //
-  void GenWriter::visitBranchInst(BranchInst &I)
+  void GenWriter::regAllocateBinaryOperator(Instruction &I)
   {
-
-    if (I.isConditional()) {
-      if (isGotoCodeNecessary(I.getParent(), I.getSuccessor(0))) {
-        Out << "  if (";
-        writeOperand(I.getCondition());
-        Out << ") {\n";
-
-        printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(0), 2);
-        printBranchToBlock(I.getParent(), I.getSuccessor(0), 2);
-
-        if (isGotoCodeNecessary(I.getParent(), I.getSuccessor(1))) {
-          Out << "  } else {\n";
-          printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(1), 2);
-          printBranchToBlock(I.getParent(), I.getSuccessor(1), 2);
-        }
-      } else {
-        // First goto not necessary, assume second one is...
-        Out << "  if (!";
-        writeOperand(I.getCondition());
-        Out << ") {\n";
-
-        printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(1), 2);
-        printBranchToBlock(I.getParent(), I.getSuccessor(1), 2);
-      }
-
-      Out << "  }\n";
-    } else {
-      printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(0), 0);
-      printBranchToBlock(I.getParent(), I.getSuccessor(0), 0);
-    }
-    Out << "\n";
+    this->newRegister(&I);
   }
 
-  // PHI nodes get copied into temporary values at the end of predecessor basic
-  // blocks.  We now need to copy these temporary values into the REAL value for
-  // the PHI.
-
-  void GenWriter::visitPHINode(PHINode &I) {
-    NOT_SUPPORTED;
-    writeOperand(&I);
-    Out << "__PHI_TEMPORARY";
-  }
-#endif
-
-  void GenWriter::visitBinaryOperator(Instruction &I)
+  void GenWriter::emitBinaryOperator(Instruction &I)
   {
-    GBE_ASSERT(!I.getType()->isPointerTy());
-    GBE_ASSERT(this->registerMap.find(&I) != this->registerMap.end());
-    const ir::Register dst = this->registerMap[&I];
-    const ir::Register src0 = this->getRegister(I.getOperand(0));
-    const ir::Register src1 = this->getRegister(I.getOperand(1));
-    const ir::Type type = this->getType(I.getType());
-
-    switch (I.getOpcode()) {
-      case Instruction::Add:
-      case Instruction::FAdd: ctx.ADD(type, dst, src0, src1); break;
-      case Instruction::Sub:
-      case Instruction::FSub: ctx.SUB(type, dst, src0, src1); break;
-      case Instruction::Mul:
-      case Instruction::FMul: ctx.MUL(type, dst, src0, src1); break;
-      case Instruction::URem:
-      case Instruction::SRem:
-      case Instruction::FRem: ctx.REM(type, dst, src0, src1); break;
-      case Instruction::UDiv:
-      case Instruction::SDiv:
-      case Instruction::FDiv: ctx.DIV(type, dst, src0, src1); break;
-      case Instruction::And:  ctx.AND(type, dst, src0, src1); break;
-      case Instruction::Or:   ctx.OR(type, dst, src0, src1); break;
-      case Instruction::Xor:  ctx.XOR(type, dst, src0, src1); break;
-      case Instruction::Shl : ctx.SHL(type, dst, src0, src1); break;
-      case Instruction::LShr: ctx.SHR(type, dst, src0, src1); break;
-      case Instruction::AShr: ctx.ASR(type, dst, src0, src1); break;
-      default:
-         GBE_ASSERT(0);
-    };
-
-#if 0
-    // binary instructions, shift instructions, setCond instructions.
-    assert(!I.getType()->isPointerTy());
-    // We must cast the results of binary operations which might be promoted.
-    bool needsCast = false;
-    if ((I.getType() == Type::getInt8Ty(I.getContext())) ||
-        (I.getType() == Type::getInt16Ty(I.getContext()))
-        || (I.getType() == Type::getFloatTy(I.getContext()))) {
-      needsCast = true;
-      Out << "((";
-      printType(Out, I.getType(), false);
-      Out << ")(";
-    }
-
-    // If this is a negation operation, print it out as such.  For FP, we don't
-    // want to print "-0.0 - X".
-    if (BinaryOperator::isNeg(&I)) {
-      Out << "-(";
-      writeOperand(BinaryOperator::getNegArgument(cast<BinaryOperator>(&I)));
-      Out << ")";
-    } else if (BinaryOperator::isFNeg(&I)) {
-      Out << "-(";
-      writeOperand(BinaryOperator::getFNegArgument(cast<BinaryOperator>(&I)));
-      Out << ")";
-    } else if (I.getOpcode() == Instruction::FRem) {
-      // Output a call to fmod/fmodf instead of emitting a%b
-      if (I.getType() == Type::getFloatTy(I.getContext()))
-        Out << "fmodf(";
-      else if (I.getType() == Type::getDoubleTy(I.getContext()))
-        Out << "fmod(";
-      else  // all 3 flavors of long double
-        Out << "fmodl(";
-      writeOperand(I.getOperand(0));
-      Out << ", ";
-      writeOperand(I.getOperand(1));
-      Out << ")";
+    GBE_ASSERT(I.getType()->isPointerTy() == false);
+
+    // Get the element type for a vector
+    ir::Type type;
+    uint32_t elemNum;
+    Type *llvmType = I.getType();
+    if (llvmType->isVectorTy() == true) {
+      VectorType *vectorType = cast<VectorType>(llvmType);
+      Type *elementType = vectorType->getElementType();
+      elemNum = vectorType->getNumElements();
+      type = getType(ctx, elementType);
     } else {
+      elemNum = 1;
+      type = getType(ctx, llvmType);
+    }
 
-      // Write out the cast of the instruction's value back to the proper type
-      // if necessary.
-      bool NeedsClosingParens = writeInstructionCast(I);
-
-      // Certain instructions require the operand to be forced to a specific type
-      // so we use writeOperandWithCast here instead of writeOperand. Similarly
-      // below for operand 1
-      writeOperandWithCast(I.getOperand(0), I.getOpcode());
+    // Emit the instructions in a row
+    for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
+      const ir::Register dst = this->getRegister(&I, elemID);
+      const ir::Register src0 = this->getRegister(I.getOperand(0), elemID);
+      const ir::Register src1 = this->getRegister(I.getOperand(1), elemID);
 
       switch (I.getOpcode()) {
-      case Instruction::Add:
-      case Instruction::FAdd: Out << " + "; break;
-      case Instruction::Sub:
-      case Instruction::FSub: Out << " - "; break;
-      case Instruction::Mul:
-      case Instruction::FMul: Out << " * "; break;
-      case Instruction::URem:
-      case Instruction::SRem:
-      case Instruction::FRem: Out << " % "; break;
-      case Instruction::UDiv:
-      case Instruction::SDiv:
-      case Instruction::FDiv: Out << " / "; break;
-      case Instruction::And:  Out << " & "; break;
-      case Instruction::Or:   Out << " | "; break;
-      case Instruction::Xor:  Out << " ^ "; break;
-      case Instruction::Shl : Out << " << "; break;
-      case Instruction::LShr:
-      case Instruction::AShr: Out << " >> "; break;
-      default:
-#ifndef NDEBUG
-         errs() << "Invalid operator type!" << I;
-#endif
-         llvm_unreachable(0);
-      }
-
-      writeOperandWithCast(I.getOperand(1), I.getOpcode());
-      if (NeedsClosingParens)
-        Out << "))";
-    }
-
-    if (needsCast) {
-      Out << "))";
+        case Instruction::Add:
+        case Instruction::FAdd: ctx.ADD(type, dst, src0, src1); break;
+        case Instruction::Sub:
+        case Instruction::FSub: ctx.SUB(type, dst, src0, src1); break;
+        case Instruction::Mul:
+        case Instruction::FMul: ctx.MUL(type, dst, src0, src1); break;
+        case Instruction::URem:
+        case Instruction::SRem:
+        case Instruction::FRem: ctx.REM(type, dst, src0, src1); break;
+        case Instruction::UDiv:
+        case Instruction::SDiv:
+        case Instruction::FDiv: ctx.DIV(type, dst, src0, src1); break;
+        case Instruction::And:  ctx.AND(type, dst, src0, src1); break;
+        case Instruction::Or:   ctx.OR(type, dst, src0, src1); break;
+        case Instruction::Xor:  ctx.XOR(type, dst, src0, src1); break;
+        case Instruction::Shl : ctx.SHL(type, dst, src0, src1); break;
+        case Instruction::LShr: ctx.SHR(type, dst, src0, src1); break;
+        case Instruction::AShr: ctx.ASR(type, dst, src0, src1); break;
+        default:
+           GBE_ASSERT(0);
+      };
     }
-#endif
   }
 
 #if 0
@@ -2322,85 +2016,41 @@ static std::string CBEMangle(const std::string &S) {
     Out << ")";
   }
 #endif
-#if 0
-  static const char * getFloatBitCastField(Type *Ty) {
-    switch (Ty->getTypeID()) {
-      default: llvm_unreachable("Invalid Type");
-      case Type::FloatTyID:  return "Float";
-      case Type::DoubleTyID: return "Double";
-      case Type::IntegerTyID: {
-        unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
-        if (NumBits <= 32)
-          return "Int32";
-        else
-          return "Int64";
-      }
-    }
-  }
-#endif
 
-#if 1
-  void GenWriter::visitCastInst(CastInst &I) {
+  void GenWriter::regAllocateCastInst(CastInst &I)
+  {
     if (I.getOpcode() == Instruction::PtrToInt ||
         I.getOpcode() == Instruction::IntToPtr) {
-      Value *srcValue = &I, *dstValue = I.getOperand(0);
-      Type *dstType = dstValue->getType();
-      Type *srcType = srcValue->getType();
-      const ir::Unit &unit = ctx.getUnit();
-      GBE_ASSERT(getTypeByteSize(unit, dstType) == getTypeByteSize(unit, srcType));
-      GBE_ASSERT(registerMap.find(dstValue) != registerMap.end());
-      registerMap[dstValue] = registerMap[srcValue];
-    } else
-      NOT_SUPPORTED;
-#if 0
-    Type *DstTy = I.getType();
-    Type *SrcTy = I.getOperand(0)->getType();
-    if (isFPIntBitCast(I)) {
-      Out << '(';
-      // These int<->float and long<->double casts need to be handled specially
-      Out << GetValueName(&I) << "__BITCAST_TEMPORARY."
-          << getFloatBitCastField(I.getOperand(0)->getType()) << " = ";
-      writeOperand(I.getOperand(0));
-      Out << ", " << GetValueName(&I) << "__BITCAST_TEMPORARY."
-          << getFloatBitCastField(I.getType());
-      Out << ')';
-      return;
-    }
-
-    Out << '(';
-    printCast(I.getOpcode(), SrcTy, DstTy);
-
-    // Make a sext from i1 work by subtracting the i1 from 0 (an int).
-    if (SrcTy == Type::getInt1Ty(I.getContext()) &&
-        I.getOpcode() == Instruction::SExt)
-      Out << "0-";
-
-    writeOperand(I.getOperand(0));
-
-    if (DstTy == Type::getInt1Ty(I.getContext()) &&
-        (I.getOpcode() == Instruction::Trunc ||
-         I.getOpcode() == Instruction::FPToUI ||
-         I.getOpcode() == Instruction::FPToSI ||
-         I.getOpcode() == Instruction::PtrToInt)) {
-      // Make sure we really get a trunc to bool by anding the operand with 1
-      Out << "&1u";
+      Value *dstValue = &I;
+      Value *srcValue = I.getOperand(0);
+      Constant *CPV = dyn_cast<Constant>(srcValue);
+      if (CPV == NULL) {
+        Type *dstType = dstValue->getType();
+        Type *srcType = srcValue->getType();
+        GBE_ASSERT(getTypeByteSize(unit, dstType) == getTypeByteSize(unit, srcType));
+        regTranslator.newValueProxy(srcValue, dstValue);
+      } else
+        this->newRegister(dstValue);
     }
-    Out << ')';
-#endif
+    else
+      NOT_SUPPORTED;
   }
-#endif
 
-#if 0
-  void GenWriter::visitSelectInst(SelectInst &I) {
-    Out << "((";
-    writeOperand(I.getCondition());
-    Out << ") ? (";
-    writeOperand(I.getTrueValue());
-    Out << ") : (";
-    writeOperand(I.getFalseValue());
-    Out << "))";
+  void GenWriter::emitCastInst(CastInst &I)
+  {
+    if (I.getOpcode() == Instruction::PtrToInt ||
+        I.getOpcode() == Instruction::IntToPtr) {
+      Value *srcValue = &I;
+      Value *dstValue = I.getOperand(0);
+      Constant *CPV = dyn_cast<Constant>(srcValue);
+      if (CPV != NULL) {
+        const ir::ImmediateIndex index = ctx.newImmediate(CPV);
+        const ir::Immediate imm = ctx.getImmediate(index);
+        const ir::Register reg = this->getRegister(dstValue);
+        ctx.LOADI(imm.type, reg, index);
+      }
+    }
   }
-#endif
 
 #ifndef NDEBUG
   static bool isSupportedIntegerSize(IntegerType &T) {
@@ -2427,8 +2077,8 @@ static std::string CBEMangle(const std::string &S) {
 #endif
   }
 
-  void GenWriter::visitCallInst(CallInst &I)
-  {
+  void GenWriter::emitCallInst(CallInst &I) {}
+  void GenWriter::regAllocateCallInst(CallInst &I) {
     Value *dst = &I;
     Value *Callee = I.getCalledValue();
     GBE_ASSERT(ctx.getFunction().getProfile() == ir::PROFILE_OCL);
@@ -2441,136 +2091,23 @@ static std::string CBEMangle(const std::string &S) {
     // With OCL there is no side effect for any called functions. So do nothing
     // when there is no returned value
     if (I.getType() == Type::getVoidTy(I.getContext()))
-      return;
+      NOT_SUPPORTED;
 
     // Get the name of the called function and handle it. We should use a hash
     // map later
     const std::string fnName = Callee->getName();
     if (fnName == "__gen_ocl_get_global_id0")
-      this->registerMap[dst] = ir::ocl::gid0;
+      regTranslator.newScalarProxy(ir::ocl::gid0, dst);
     else if (fnName == "__gen_ocl_get_global_id1")
-      this->registerMap[dst] = ir::ocl::gid1;
+      regTranslator.newScalarProxy(ir::ocl::gid1, dst);
     else if (fnName == "__gen_ocl_get_global_id2")
-      this->registerMap[dst] = ir::ocl::gid2;
+      regTranslator.newScalarProxy(ir::ocl::gid2, dst);
     else if (fnName == "__gen_ocl_get_local_id0")
-      this->registerMap[dst] = ir::ocl::lid0;
+      regTranslator.newScalarProxy(ir::ocl::lid0, dst);
     else if (fnName == "__gen_ocl_get_local_id1")
-      this->registerMap[dst] = ir::ocl::lid1;
+      regTranslator.newScalarProxy(ir::ocl::lid1, dst);
     else if (fnName == "__gen_ocl_get_local_id2")
-      this->registerMap[dst] = ir::ocl::lid2;
-
-#if 0
-      return visitInlineAsm(I);
-
-    bool WroteCallee = false;
-
-    // Handle intrinsic function calls first...
-    if (Function *F = I.getCalledFunction())
-      if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID())
-        if (visitBuiltinCall(I, ID, WroteCallee))
-          return;
-
-    Value *Callee = I.getCalledValue();
-    Out << (Callee->getName());
-    PointerType  *PTy   = cast<PointerType>(Callee->getType());
-    FunctionType *FTy   = cast<FunctionType>(PTy->getElementType());
-
-    // If this is a call to a struct-return function, assign to the first
-    // parameter instead of passing it to the call.
-    const AttrListPtr &PAL = I.getAttributes();
-    bool hasByVal = I.hasByValArgument();
-    bool isStructRet = I.hasStructRetAttr();
-    if (isStructRet) {
-      writeOperandDeref(I.getArgOperand(0));
-      Out << " = ";
-    }
-
-    if (I.isTailCall()) Out << " /*tail*/ ";
-
-    if (!WroteCallee) {
-      // If this is an indirect call to a struct return function, we need to cast
-      // the pointer. Ditto for indirect calls with byval arguments.
-      bool NeedsCast = (hasByVal || isStructRet) && !isa<Function>(Callee);
-
-      // GCC is a real PITA.  It does not permit codegening casts of functions to
-      // function pointers if they are in a call (it generates a trap instruction
-      // instead!).  We work around this by inserting a cast to void* in between
-      // the function and the function pointer cast.  Unfortunately, we can't just
-      // form the constant expression here, because the folder will immediately
-      // nuke it.
-      //
-      // Note finally, that this is completely unsafe.  ANSI C does not guarantee
-      // that void* and function pointers have the same size. :( To deal with this
-      // in the common case, we handle casts where the number of arguments passed
-      // match exactly.
-      //
-      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Callee))
-        if (CE->isCast())
-          if (Function *RF = dyn_cast<Function>(CE->getOperand(0))) {
-            NeedsCast = true;
-            Callee = RF;
-          }
-
-      if (NeedsCast) {
-        // Ok, just cast the pointer type.
-        Out << "((";
-        if (isStructRet)
-          printStructReturnPointerFunctionType(Out, PAL,
-                               cast<PointerType>(I.getCalledValue()->getType()));
-        else if (hasByVal)
-          printType(Out, I.getCalledValue()->getType(), false, "", true, PAL);
-        else
-          printType(Out, I.getCalledValue()->getType());
-        Out << ")(void*)";
-      }
-      writeOperand(Callee);
-      if (NeedsCast) Out << ')';
-    }
-
-    Out << '(';
-
-    bool PrintedArg = false;
-    if(FTy->isVarArg() && !FTy->getNumParams()) {
-      Out << "0 /*dummy arg*/";
-      PrintedArg = true;
-    }
-
-    unsigned NumDeclaredParams = FTy->getNumParams();
-    CallSite CS(&I);
-    CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
-    unsigned ArgNo = 0;
-    if (isStructRet) {   // Skip struct return argument.
-      ++AI;
-      ++ArgNo;
-    }
-
-
-    for (; AI != AE; ++AI, ++ArgNo) {
-      if (PrintedArg) Out << ", ";
-      if (ArgNo < NumDeclaredParams &&
-          (*AI)->getType() != FTy->getParamType(ArgNo)) {
-        Out << '(';
-        printType(Out, FTy->getParamType(ArgNo),
-              /*isSigned=*/PAL.paramHasAttr(ArgNo+1, Attribute::SExt));
-        Out << ')';
-      }
-      // Check if the argument is expected to be passed by value.
-      if (I.paramHasAttr(ArgNo+1, Attribute::ByVal))
-        writeOperandDeref(*AI);
-      else
-        writeOperand(*AI);
-      PrintedArg = true;
-    }
-    Out << ')';
-#endif
-  }
-
-  /// visitBuiltinCall - Handle the call to the specified builtin.  Returns true
-  /// if the entire call is handled, return false if it wasn't handled, and
-  /// optionally set 'WroteCallee' if the callee has already been printed out.
-  bool GenWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID, bool &WroteCallee) {
-    GBE_ASSERTM(false, "builtin call is not supported");
-    return false;
+      regTranslator.newScalarProxy(ir::ocl::lid2, dst);
   }
 
   void GenWriter::visitAllocaInst(AllocaInst &I) {
@@ -2601,9 +2138,13 @@ static std::string CBEMangle(const std::string &S) {
   static INLINE Value *getLoadOrStoreValue(StoreInst &I) {
     return I.getValueOperand();
   }
+  void GenWriter::regAllocateLoadInst(LoadInst &I) {
+    this->newRegister(&I);
+  }
+  void GenWriter::regAllocateStoreInst(StoreInst &I) {}
 
   template <bool isLoad, typename T>
-  INLINE void GenWriter::visitLoadOrStore(T &I)
+  INLINE void GenWriter::emitLoadOrStore(T &I)
   {
     GBE_ASSERTM(I.isVolatile() == false, "Volatile pointer is not supported");
     unsigned int llvmSpace = I.getPointerAddressSpace();
@@ -2612,124 +2153,47 @@ static std::string CBEMangle(const std::string &S) {
     Type *llvmType = llvmValues->getType();
     const bool dwAligned = (I.getAlignment() % 4) == 0;
     const ir::MemorySpace memSpace = addressSpaceLLVMToGen(llvmSpace);
-    const ir::Type type = getType(llvmType);
-    const ir::Register values = getRegister(llvmValues);
-    const ir::Register ptr = getRegister(llvmPtr);
-    if (isLoad)
-      ctx.LOAD(type, ptr, memSpace, dwAligned, values);
-    else
-      ctx.STORE(type, ptr, memSpace, dwAligned, values);
-  }
-
-  void GenWriter::visitLoadInst(LoadInst &I) {
-    this->visitLoadOrStore<true>(I);
-  }
-
-  void GenWriter::visitStoreInst(StoreInst &I) {
-    this->visitLoadOrStore<false>(I);
-  }
-
-#if 0
-  void GenWriter::visitInsertElementInst(InsertElementInst &I) {
-    Type *EltTy = I.getType()->getElementType();
-    writeOperand(I.getOperand(0));
-    Out << ";\n  ";
-    Out << "((";
-    printType(Out, PointerType::getUnqual(EltTy));
-    Out << ")(&" << GetValueName(&I) << "))[";
-    writeOperand(I.getOperand(2));
-    Out << "] = (";
-    writeOperand(I.getOperand(1));
-    Out << ")";
-  }
-
-  void GenWriter::visitExtractElementInst(ExtractElementInst &I) {
-    // We know that our operand is not inlined.
-    Out << "((";
-    Type *EltTy = cast<VectorType>(I.getOperand(0)->getType())->getElementType();
-    printType(Out, PointerType::getUnqual(EltTy));
-    Out << ")(&" << GetValueName(I.getOperand(0)) << "))[";
-    writeOperand(I.getOperand(1));
-    Out << "]";
-  }
-
-  void GenWriter::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
-    Out << "(";
-    printType(Out, SVI.getType());
-    Out << "){ ";
-    VectorType *VT = SVI.getType();
-    unsigned NumElts = VT->getNumElements();
-    Type *EltTy = VT->getElementType();
-
-    for (unsigned i = 0; i != NumElts; ++i) {
-      if (i) Out << ", ";
-      int SrcVal = SVI.getMaskValue(i);
-      if ((unsigned)SrcVal >= NumElts*2) {
-        Out << " 0/*undef*/ ";
-      } else {
-        Value *Op = SVI.getOperand((unsigned)SrcVal >= NumElts);
-        if (isa<Instruction>(Op)) {
-          // Do an extractelement of this value from the appropriate i. So do
-          // nothing when there is no returned valuenput.
-          Out << "((";
-          printType(Out, PointerType::getUnqual(EltTy));
-          Out << ")(&" << GetValueName(Op)
-              << "))[" << (SrcVal & (NumElts-1)) << "]";
-        } else if (isa<ConstantAggregateZero>(Op) || isa<UndefValue>(Op)) {
-          Out << "0";
-        } else {
-          printConstant(cast<ConstantVector>(Op)->getOperand(SrcVal &
-                                                             (NumElts-1)),
-                        false);
-        }
-      }
+    const ir::Register ptr = this->getRegister(llvmPtr);
+
+    // Scalar is easy. We neednot build register tuples
+    if (isScalarType(llvmType) == true) {
+      const ir::Type type = getType(ctx, llvmType);
+      const ir::Register values = this->getRegister(llvmValues);
+      if (isLoad)
+        ctx.LOAD(type, ptr, memSpace, dwAligned, values);
+      else
+        ctx.STORE(type, ptr, memSpace, dwAligned, values);
     }
-    Out << "}";
-  }
+    // A vector type requires to build a tuple
+    else {
+      VectorType *vectorType = cast<VectorType>(llvmType);
+      Type *elemType = vectorType->getElementType();
+
+      // Build the tuple data in the vector
+      vector<ir::Register> tupleData; // put registers here
+      const uint32_t elemNum = vectorType->getNumElements();
+      for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
+        const ir::Register reg = this->getRegister(llvmValues, elemID);
+        tupleData.push_back(reg);
+      }
+      const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
 
-  void GenWriter::visitInsertValueInst(InsertValueInst &IVI) {
-    // Start by copying the entire aggregate value into the result variable.
-        writeOperand(IVI.getOperand(0));
-    Out << ";\n  ";
-
-    // Then do the insert to update the field.
-    Out << GetValueName(&IVI);
-    for (const unsigned *b = IVI.idx_begin(), *i = b, *e = IVI.idx_end();
-         i != e; ++i) {
-      Type *IndexedTy =
-        ExtractValueInst::getIndexedType(IVI.getOperand(0)->getType(),
-                                         makeArrayRef(b, i+1));
-      if (IndexedTy->isArrayTy())
-        Out << ".array[" << *i << "]";
+      // Emit the instruction
+      const ir::Type type = getType(ctx, elemType);
+      if (isLoad)
+        ctx.LOAD(type, tuple, ptr, memSpace, elemNum, dwAligned);
       else
-        Out << ".field" << *i;
+        ctx.STORE(type, tuple, ptr, memSpace, elemNum, dwAligned);
     }
-    Out << " = ";
-    writeOperand(IVI.getOperand(1));
   }
 
-  void GenWriter::visitExtractValueInst(ExtractValueInst &EVI) {
-    Out << "(";
-    if (isa<UndefValue>(EVI.getOperand(0))) {
-      Out << "(";
-      printType(Out, EVI.getType());
-      Out << ") 0/*UNDEF*/";
-    } else {
-      Out << GetValueName(EVI.getOperand(0));
-      for (const unsigned *b = EVI.idx_begin(), *i = b, *e = EVI.idx_end();
-           i != e; ++i) {
-        Type *IndexedTy =
-          ExtractValueInst::getIndexedType(EVI.getOperand(0)->getType(),
-                                           makeArrayRef(b, i+1));
-        if (IndexedTy->isArrayTy())
-          Out << ".array[" << *i << "]";
-        else
-          Out << ".field" << *i;
-      }
-    }
-    Out << ")";
+  void GenWriter::emitLoadInst(LoadInst &I) {
+    this->emitLoadOrStore<true>(I);
+  }
+
+  void GenWriter::emitStoreInst(StoreInst &I) {
+    this->emitLoadOrStore<false>(I);
   }
-#endif
 
   llvm::FunctionPass *createGenPass(ir::Unit &unit) {
     return new GenWriter(unit);
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index bc30c1b..b2d6391 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -82,7 +82,11 @@ namespace gbe
 
     switch (Ty->getTypeID()) {
       case Type::VoidTyID: NOT_SUPPORTED;
-      case Type::VectorTyID: NOT_SUPPORTED;
+      case Type::VectorTyID:
+      {
+        const VectorType* VecTy = cast<VectorType>(Ty);
+        return VecTy->getNumElements() * getTypeByteSize(unit, VecTy->getElementType());
+      }
       case Type::PointerTyID:
       case Type::IntegerTyID:
       case Type::FloatTyID:
diff --git a/backend/src/utest/utest_llvm.cpp b/backend/src/utest/utest_llvm.cpp
index ff36d14..852f07d 100644
--- a/backend/src/utest/utest_llvm.cpp
+++ b/backend/src/utest/utest_llvm.cpp
@@ -76,12 +76,14 @@ runTests:
   GBE_ASSERT(dummyKernel != NULL);
   fclose(dummyKernel);
 
-  //UTEST_EXPECT_SUCCESS(utestLLVM2Gen("add.ll"));
-  //UTEST_EXPECT_SUCCESS(utestLLVM2Gen("load_store.ll"));
-  //UTEST_EXPECT_SUCCESS(utestLLVM2Gen("add2.ll"));
+  UTEST_EXPECT_SUCCESS(utestLLVM2Gen("add.ll"));
+  UTEST_EXPECT_SUCCESS(utestLLVM2Gen("load_store.ll"));
+  UTEST_EXPECT_SUCCESS(utestLLVM2Gen("add2.ll"));
   UTEST_EXPECT_SUCCESS(utestLLVM2Gen("get_global_id.ll"));
+  UTEST_EXPECT_SUCCESS(utestLLVM2Gen("simple_float4.ll"));
+  UTEST_EXPECT_SUCCESS(utestLLVM2Gen("simple_float4_2.ll"));
   //UTEST_EXPECT_SUCCESS(utestLLVM2Gen("loop.ll"));
-  //UTEST_EXPECT_SUCCESS(utestLLVM2Gen("void.ll"));
+  UTEST_EXPECT_SUCCESS(utestLLVM2Gen("void.ll"));
 }
 
 UTEST_REGISTER(utestLLVM)
-- 
2.7.4