From 7792e6429c8fca33f8bdfd2cadb2c874b65f450d Mon Sep 17 00:00:00 2001 From: Benjamin Segovia Date: Tue, 10 Apr 2012 02:48:56 -0700 Subject: [PATCH] Completely redesigned the vector API for the simulator. Now it is properly untyped as the Gen IR is --- backend/src/backend/sim/sim_vector.h | 520 ++++++++++++++++------------- backend/src/backend/sim/sim_vector_str.cpp | 520 ++++++++++++++++------------- backend/src/utest/utest_vector.cpp | 334 +++++++++++++++--- 3 files changed, 874 insertions(+), 500 deletions(-) diff --git a/backend/src/backend/sim/sim_vector.h b/backend/src/backend/sim/sim_vector.h index 44cfdd5..1d09c98 100644 --- a/backend/src/backend/sim/sim_vector.h +++ b/backend/src/backend/sim/sim_vector.h @@ -35,309 +35,373 @@ #include #define INLINE inline __attribute__((always_inline)) +#define ID(X) (X) +#define PS2SI(X) _mm_castps_si128(X) +#define SI2PS(X) _mm_castsi128_ps(X) -/*! Base structure for 1 / 4 / 8 / 16 / 32 floats */ -template -struct genf { __m128 m[vectorNum]; }; -/*! Base structure for 1 / 4 / 8 / 16 / 32 integers */ -template -struct geni { __m128i m[vectorNum]; }; -/*! Base structure for 1 / 4 / 8 / 16 / 32 booleans (m stands for "mask") */ -template -struct genm { __m128 m[vectorNum]; }; - -/*! To cast through memory */ -union CastType { - INLINE CastType(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) { - u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3; - } - INLINE CastType(float f0, float f1, float f2, float f3) { - f[0] = f0; f[1] = f1; f[2] = f2; f[3] = f3; - } - __m128 v; - __m128i vi; - uint32_t u[4]; - float f[4]; -}; - -typedef genf<1,true> genf1; // contains 3 clobbered values -typedef genf<1,false> genf4; -typedef genf<2,false> genf8; -typedef genf<4,false> genf16; -typedef genf<8,false> genf32; -typedef geni<1,true> geni1; // contains 3 clobbered values -typedef geni<1,false> geni4; -typedef geni<2,false> geni8; -typedef geni<4,false> geni16; -typedef geni<8,false> geni32; -typedef genm<1,true> genm1; // contains 3 clobbered values -typedef genm<1,false> genm4; -typedef genm<2,false> genm8; -typedef genm<4,false> genm16; -typedef genm<8,false> genm32; - -static INLINE uint32_t elemNum(genf1 x) { return 1; } -static INLINE uint32_t elemNum(genf4 x) { return 4; } -static INLINE uint32_t elemNum(genf8 x) { return 8; } -static INLINE uint32_t elemNum(genf16 x) { return 16; } -static INLINE uint32_t elemNum(genf32 x) { return 32; } -static INLINE uint32_t elemNum(geni1 x) { return 1; } -static INLINE uint32_t elemNum(geni4 x) { return 4; } -static INLINE uint32_t elemNum(geni8 x) { return 8; } -static INLINE uint32_t elemNum(geni16 x) { return 16; } -static INLINE uint32_t elemNum(geni32 x) { return 32; } - +/* Some extra SSE functions */ template INLINE const __m128 shuffle(const __m128& b) { return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))); } - template INLINE __m128 expand(const __m128& b) { return shuffle(b); } - template INLINE const __m128i shuffle(const __m128i& a) { return _mm_shuffle_epi32(a, _MM_SHUFFLE(index_3, index_2, index_1, index_0)); } - template INLINE const __m128i expand(const __m128i& b) { return shuffle(b); } +/*! Base structure for scalar double word */ +union scalar_dw { uint32_t u; int32_t s; float f; }; + +/*! Base structure for scalar mask */ +union scalar_m { uint32_t u; int32_t s; float f; }; + +/*! 
Base structure for vectors 4 / 8 / 16 / 32 double words */ +template +struct simd_dw { + INLINE simd_dw(void) {} + INLINE simd_dw(const scalar_dw &s) { + for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&s.f); + } + simd_dw &operator= (const scalar_dw &s) { + for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&s.f); + return *this; + } + __m128 m[vectorNum]; +}; + +/*! Base structure for 4 / 8 / 16 / 32 booleans (m stands for "mask") */ +template +struct simd_m { + INLINE simd_m(void) {} + INLINE simd_m(scalar_m s) { + for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&s.f); + } + __m128 m[vectorNum]; +}; + +/*! To cast through memory */ +union cast_dw { + INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) { + u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3; + } + INLINE cast_dw(int32_t s0, int32_t s1, int32_t s2, int32_t s3) { + s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; + } + INLINE cast_dw(float f0, float f1, float f2, float f3) { + f[0] = f0; f[1] = f1; f[2] = f2; f[3] = f3; + } + INLINE cast_dw(const __m128 &v) : v(v) {} + INLINE cast_dw(const __m128i &vi) : vi(vi) {} + INLINE cast_dw(void) {} + __m128 v; + __m128i vi; + uint32_t u[4]; + int32_t s[4]; + float f[4]; +}; +static const cast_dw alltrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff); + +/* Some convenient typedefs */ +typedef scalar_dw simd1dw; +typedef simd_dw<1> simd4dw; +typedef simd_dw<2> simd8dw; +typedef simd_dw<4> simd16dw; +typedef simd_dw<8> simd32dw; +typedef scalar_m simd1m; +typedef simd_m<1> simd4m; +typedef simd_m<2> simd8m; +typedef simd_m<4> simd16m; +typedef simd_m<8> simd32m; + +////////////////////////////////////////////////////////////////////////////// +// Vector instructions +////////////////////////////////////////////////////////////////////////////// +/* Simple function to get the number of element per vector */ +template +INLINE uint32_t elemNum(const simd_dw &x) { + return 4 * vectorNum; +} +template +INLINE uint32_t elemNum(const simd_m &x) { + return 4 * vectorNum; +} + /* Build an integer mask from the mask vectors */ template -INLINE uint32_t mask(const genm v) { +INLINE uint32_t mask(const simd_m v) { uint32_t m = _mm_movemask_ps(v.m[0]); for (uint32_t i = 1; i < vectorNum; ++i) - m |= _mm_movemask_ps(v.m[i]) << (4*i); + m |= (_mm_movemask_ps(v.m[i]) << (4*i)); return m; } -INLINE uint32_t mask(const genm1 &v) { return _mm_movemask_ps(v.m[0]) & 1; } -#define ID(X) X +/* Vector instructions that use sse* */ #define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\ template \ -INLINE void NAME(DST_TYPE &dst,\ - const SRC_TYPE &v0,\ - const SRC_TYPE &v1) {\ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\ for (uint32_t i = 0; i < vectorNum; ++i)\ dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i])));\ }\ template \ -INLINE void NAME(DST_TYPE &dst,\ - const SRC_TYPE &v0,\ - const SRC_TYPE##1 &v1) {\ - for (uint32_t i = 0; i < vectorNum; ++i)\ - dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(expand<0>(v1.m[0]))));\ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\ + NAME(dst, v0, simd_dw(v1));\ }\ template \ -INLINE void NAME(DST_TYPE &dst,\ - const SRC_TYPE##1 &v0,\ - const SRC_TYPE &v1) {\ - for (uint32_t i = 0; i < vectorNum; ++i)\ - dst.m[i] = FN(INTRINSIC_NAME(FN0(expand<0>(v0.m[0])), FN1(v1.m[i])));\ +INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\ + NAME(dst, simd_dw(v0), v1);\ } - -VEC_OP(genf, genf, ADD, _mm_add_ps, ID, ID, ID); -VEC_OP(genf, 
genf, SUB, _mm_sub_ps, ID, ID, ID); -VEC_OP(genf, genf, MUL, _mm_mul_ps, ID, ID, ID); -VEC_OP(genf, genf, DIV, _mm_div_ps, ID, ID, ID); -VEC_OP(genm, genf, EQ, _mm_cmpeq_ps, ID, ID, ID); -VEC_OP(genm, genf, NE, _mm_cmpneq_ps, ID, ID, ID); -VEC_OP(genm, genf, LT, _mm_cmplt_ps, ID, ID, ID); -VEC_OP(genm, genf, LE, _mm_cmple_ps, ID, ID, ID); -VEC_OP(genm, genf, GT, _mm_cmpgt_ps, ID, ID, ID); -VEC_OP(genm, genf, GE, _mm_cmpge_ps, ID, ID, ID); -VEC_OP(geni, geni, ADD, _mm_add_epi32, ID, ID, ID); -VEC_OP(geni, geni, SUB, _mm_sub_epi32, ID, ID, ID); -VEC_OP(genm, geni, EQ, _mm_cmpeq_epi32, ID, ID, ID); -VEC_OP(genm, geni, SLT, _mm_cmplt_epi32, ID, ID, ID); -VEC_OP(genm, geni, SGT, _mm_cmpgt_epi32, ID, ID, ID); -VEC_OP(geni, geni, OR, _mm_or_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps); -VEC_OP(geni, geni, XOR, _mm_xor_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps); -VEC_OP(geni, geni, AND, _mm_and_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps); -VEC_OP(genm, genf, SLT, _mm_cmplt_ps, ID, ID, ID); -VEC_OP(genm, genf, SLE, _mm_cmple_ps, ID, ID, ID); -VEC_OP(genm, genf, SGT, _mm_cmpgt_ps, ID, ID, ID); -VEC_OP(genm, genf, SGE, _mm_cmpge_ps, ID, ID, ID); - +VEC_OP(simd_dw, simd_dw, ADD_F, _mm_add_ps, ID, ID, ID); +VEC_OP(simd_dw, simd_dw, SUB_F, _mm_sub_ps, ID, ID, ID); +VEC_OP(simd_dw, simd_dw, MUL_F, _mm_mul_ps, ID, ID, ID); +VEC_OP(simd_dw, simd_dw, DIV_F, _mm_div_ps, ID, ID, ID); +VEC_OP(simd_m, simd_dw, EQ_F, _mm_cmpeq_ps, ID, ID, ID); +VEC_OP(simd_m, simd_dw, NE_F, _mm_cmpneq_ps, ID, ID, ID); +VEC_OP(simd_m, simd_dw, LT_F, _mm_cmplt_ps, ID, ID, ID); +VEC_OP(simd_m, simd_dw, LE_F, _mm_cmple_ps, ID, ID, ID); +VEC_OP(simd_m, simd_dw, GT_F, _mm_cmpgt_ps, ID, ID, ID); +VEC_OP(simd_m, simd_dw, GE_F, _mm_cmpge_ps, ID, ID, ID); +VEC_OP(simd_dw, simd_dw, ADD_S32, _mm_add_epi32, SI2PS, PS2SI, PS2SI); +VEC_OP(simd_dw, simd_dw, SUB_S32, _mm_sub_epi32, SI2PS, PS2SI, PS2SI); +VEC_OP(simd_m, simd_dw, EQ_S32, _mm_cmpeq_epi32, SI2PS, PS2SI, PS2SI); +VEC_OP(simd_m, simd_dw, LT_S32, _mm_cmplt_epi32, SI2PS, PS2SI, PS2SI); +VEC_OP(simd_m, simd_dw, GT_S32, _mm_cmpgt_epi32, SI2PS, PS2SI, PS2SI); +VEC_OP(simd_dw, simd_dw, OR_S32, _mm_or_ps, ID, ID, ID); +VEC_OP(simd_dw, simd_dw, XOR_S32, _mm_xor_ps, ID, ID, ID); +VEC_OP(simd_dw, simd_dw, AND_S32, _mm_and_ps, ID, ID, ID); #undef VEC_OP -#define ICMP_VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\ +/* Vector integer operations that we can get by switching argument order */ +#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\ template \ -INLINE void NAME(DST_TYPE &dst,\ - const SRC_TYPE &v0,\ - const SRC_TYPE &v1) {\ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\ for (uint32_t i = 0; i < vectorNum; ++i)\ - dst.m[i] = FN(INTRINSIC_NAME(FN1(v1.m[i]), FN0(v0.m[i])));\ + dst.m[i] = _mm_xor_ps(FN(INTRINSIC_NAME(FN1(v0.m[i]), FN0(v1.m[i]))), alltrue.v);\ }\ template \ -INLINE void NAME(DST_TYPE &dst,\ - const SRC_TYPE &v0,\ - const SRC_TYPE##1 &v1) {\ - for (uint32_t i = 0; i < vectorNum; ++i)\ - dst.m[i] = FN(INTRINSIC_NAME(FN1(expand<0>(v1.m[0])), FN0(v0.m[i])));\ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\ + NAME(dst, v0, simd_dw(v1));\ }\ template \ -INLINE void NAME(DST_TYPE &dst,\ - const SRC_TYPE##1 &v0,\ - const SRC_TYPE &v1) {\ - for (uint32_t i = 0; i < vectorNum; ++i)\ - dst.m[i] = FN(INTRINSIC_NAME(FN1(v1.m[i]), FN0(expand<0>(v0.m[0]))));\ +INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\ + NAME(dst, 
simd_dw(v0), v1);\ +} +VEC_OP(simd_m, simd_dw, GE_S32, _mm_cmplt_epi32, SI2PS, PS2SI, PS2SI); +VEC_OP(simd_m, simd_dw, LE_S32, _mm_cmpgt_epi32, SI2PS, PS2SI, PS2SI); +#undef VEC_OP + +/* Vector binary integer operations that require C */ +#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, OP, FIELD)\ +template \ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\ + for (uint32_t i = 0; i < vectorNum; ++i) {\ + cast_dw c0(v0.m[i]), c1(v1.m[i]), d;\ + for (uint32_t j = 0; j < 4; ++j)\ + d.FIELD[j] = c0.FIELD[j] OP c1.FIELD[j];\ + dst.m[i] = d.v;\ + }\ +}\ +template \ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\ + NAME(dst, v0, simd_dw(v1));\ +}\ +template \ +INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\ + NAME(dst, simd_dw(v0), v1);\ } -ICMP_VEC_OP(genm, geni, SGE, _mm_cmplt_epi32, ID, ID, ID); -ICMP_VEC_OP(genm, geni, SLE, _mm_cmpgt_epi32, ID, ID, ID); -#undef ICMP_VEC_OP +VEC_OP(simd_dw, simd_dw, MUL_S32, *, s); +VEC_OP(simd_dw, simd_dw, DIV_S32, /, s); +VEC_OP(simd_dw, simd_dw, REM_S32, %, s); +VEC_OP(simd_dw, simd_dw, MUL_U32, *, u); +VEC_OP(simd_dw, simd_dw, DIV_U32, /, u); +VEC_OP(simd_dw, simd_dw, REM_U32, %, u); +#undef VEC_OP -static const CastType alltrue(0xffffffff,0xffffffff,0xffffffff,0xffffffff); +/* Vector compare vectors that require C */ +#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, OP, FIELD)\ +template \ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\ + for (uint32_t i = 0; i < vectorNum; ++i) {\ + cast_dw c0(v0.m[i]), c1(v1.m[i]), d;\ + for (uint32_t j = 0; j < 4; ++j)\ + d.u[j] = (c0.FIELD[j] OP c1.FIELD[j]) ? ~0u : 0u;\ + dst.m[i] = d.v;\ + }\ +}\ +template \ +INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\ + for (uint32_t i = 0; i < vectorNum; ++i) {\ + cast_dw c0(v0.m[i]), d;\ + for (uint32_t j = 0; j < 4; ++j)\ + d.u[j] = (c0.FIELD[j] OP v1.FIELD) ? ~0u : 0u;\ + dst.m[i] = d.v;\ + }\ +}\ +template \ +INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\ + for (uint32_t i = 0; i < vectorNum; ++i) {\ + cast_dw c1(v1.m[i]), d;\ + for (uint32_t j = 0; j < 4; ++j)\ + d.u[j] = (v0.FIELD OP c1.FIELD[j]) ? 
~0u : 0u;\ + dst.m[i] = d.v;\ + }\ +} +VEC_OP(simd_m, simd_dw, LE_U32, <=, u); +VEC_OP(simd_m, simd_dw, LT_U32, <, u); +VEC_OP(simd_m, simd_dw, GE_U32, >=, u); +VEC_OP(simd_m, simd_dw, GT_U32, >, u); +#undef VEC_OP template -INLINE void NE(genm &dst, const geni &v0, const geni &v1) { - for (uint32_t i = 0; i < vectorNum; ++i) - dst.m[i] = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], v1.m[i])),alltrue.v)); +INLINE void NE_S32(simd_m &dst, + const simd_dw &v0, + const scalar_dw &v1) +{ + NE_S32(dst, v0, simd_dw(v1)); } template -INLINE void NE(genm &dst, const geni &v0, const geni1 &v1) { - for (uint32_t i = 0; i < vectorNum; ++i) - dst.m[i] = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], v1.m[0])),alltrue.v)); +INLINE void NE_S32(simd_m &dst, + const scalar_dw &v0, + const simd_dw &v1) +{ + NE_S32(dst, simd_dw(v0), v1); } template -INLINE void NE(genm &dst, const geni1 &v0, const geni &v1) { +INLINE void NE_S32(simd_m &dst, + const simd_dw &v0, + const simd_dw &v1) +{ for (uint32_t i = 0; i < vectorNum; ++i) - dst.m[i] = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[0], v1.m[i])),alltrue.v)); + dst.m[i] = _mm_xor_ps(alltrue.v, SI2PS(_mm_cmpeq_epi32(PS2SI(v0.m[i]), PS2SI(v1.m[i])))); } -#define SCALAR_OP(TYPE, NAME, INTRINSIC_NAME)\ -INLINE void NAME(TYPE &dst, const TYPE &v0, const TYPE &v1) {\ - dst.m[0] = INTRINSIC_NAME(v0.m[0], v1.m[0]);\ -} -SCALAR_OP(genf1, ADD, _mm_add_ss); -SCALAR_OP(genf1, SUB, _mm_sub_ss); -SCALAR_OP(genf1, MUL, _mm_mul_ss); -SCALAR_OP(genf1, DIV, _mm_div_ss); -SCALAR_OP(geni1, ADD, _mm_add_epi32); -SCALAR_OP(geni1, SUB, _mm_sub_epi32); -#undef SCALAR_OP - -/* load from contiguous floats / integers */ +/* Load from contiguous double words */ template -INLINE void LOAD(genf &dst, const char *ptr) { +INLINE void LOAD(simd_dw &dst, const char *ptr) { for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = _mm_loadu_ps((const float*) ptr + 4*i); } -INLINE void LOAD(genf1 &dst, const char *ptr) { - dst.m[0] = _mm_load_ss((const float*) ptr); -} -template -INLINE void LOAD(geni &dst, const char *ptr) { - for (uint32_t i = 0; i < vectorNum; ++i) - dst.m[i] = _mm_castps_si128(_mm_loadu_ps((const float*) ptr + 4*i)); -} -INLINE void LOAD(geni1 &dst, const char *ptr) { - dst.m[0] = _mm_castps_si128(_mm_load_ss((const float*) ptr)); -} -/* store to contiguous floats / integers */ +/* Store to contiguous double words */ template -INLINE void STORE(const genf &src, char *ptr) { +INLINE void STORE(const simd_dw &src, char *ptr) { for (uint32_t i = 0; i < vectorNum; ++i) _mm_storeu_ps((float*) ptr + 4*i, src.m[i]); } -INLINE void STORE(genf1 &src, char *ptr) { - _mm_store_ss((float*) ptr, src.m[0]); -} -template -INLINE void STORE(const geni &src, char *ptr) { - for (uint32_t i = 0; i < vectorNum; ++i) - _mm_storeu_ps((float*) ptr + 4*i, _mm_castsi128_ps(src.m[i])); -} -INLINE void STORE(const geni1 &src, char *ptr) { - _mm_store_ss((float*) ptr, _mm_castsi128_ps(src.m[0])); -} /* Load immediates */ template -INLINE void LOADI(genf &dst, float f) { +INLINE void LOADI(simd_dw &dst, float f) { for (uint32_t i = 0; i < vectorNum; ++i) dst.m[i] = _mm_load1_ps(&f); } -INLINE void LOADI(genf1 &dst, float f) { dst.m[0] = _mm_load_ss(&f); } -template -INLINE void LOADI(geni &dst, uint32_t u) { - union { float f; uint32_t u; } cast; - cast.u = u; - for (uint32_t i = 0; i < vectorNum; ++i) - dst.m[i] = _mm_castps_si128(_mm_load1_ps(&cast.f)); -} -INLINE void LOADI(geni1 &dst, uint32_t u) { - union { float f; uint32_t u; } 
cast; - cast.u = u; - dst.m[0] = _mm_castps_si128(_mm_load_ss(&cast.f)); -} /* Scatter */ -#define SCATTER_OP(TYPE, FN)\ -template \ -INLINE void SCATTER(const TYPE &value,\ - const geni &offset,\ - char *base_address) {\ - for (uint32_t i = 0; i < vectorNum; ++i) {\ - const int v0 = _mm_extract_epi32(FN(value.m[i]), 0);\ - const int v1 = _mm_extract_epi32(FN(value.m[i]), 1);\ - const int v2 = _mm_extract_epi32(FN(value.m[i]), 2);\ - const int v3 = _mm_extract_epi32(FN(value.m[i]), 3);\ - const int o0 = _mm_extract_epi32(offset.m[i], 0);\ - const int o1 = _mm_extract_epi32(offset.m[i], 1);\ - const int o2 = _mm_extract_epi32(offset.m[i], 2);\ - const int o3 = _mm_extract_epi32(offset.m[i], 3);\ - *(int*)(base_address + o0) = v0;\ - *(int*)(base_address + o1) = v1;\ - *(int*)(base_address + o2) = v2;\ - *(int*)(base_address + o3) = v3;\ - }\ -}\ -INLINE void SCATTER(const TYPE##1 &value, const geni1 &offset, char *base_address) {\ - const int v0 = _mm_extract_epi32(FN(value.m[0]), 0);\ - const int o0 = _mm_extract_epi32(offset.m[0], 0);\ - *(int*)(base_address + o0) = v0;\ +template +INLINE void SCATTER(const simd_dw &value, + const simd_dw &offset, + char *base_address) { + for (uint32_t i = 0; i < vectorNum; ++i) { + const int v0 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 0); + const int v1 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 1); + const int v2 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 2); + const int v3 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 3); + const int o0 = _mm_extract_epi32(offset.m[i], 0); + const int o1 = _mm_extract_epi32(offset.m[i], 1); + const int o2 = _mm_extract_epi32(offset.m[i], 2); + const int o3 = _mm_extract_epi32(offset.m[i], 3); + *(int*)(base_address + o0) = v0; + *(int*)(base_address + o1) = v1; + *(int*)(base_address + o2) = v2; + *(int*)(base_address + o3) = v3; + } } -SCATTER_OP(genf, _mm_castps_si128) -SCATTER_OP(geni, ID) -#undef SCATTER_OP /* Gather */ -#define GATHER_OP(TYPE, FN)\ -template \ -INLINE void GATHER(TYPE &dst,\ - const geni &offset,\ - char *base_address) {\ - for (uint32_t i = 0; i < vectorNum; ++i) {\ - const int o0 = _mm_extract_epi32(offset.m[i], 0);\ - const int o1 = _mm_extract_epi32(offset.m[i], 1);\ - const int o2 = _mm_extract_epi32(offset.m[i], 2);\ - const int o3 = _mm_extract_epi32(offset.m[i], 3);\ - const int v0 = *(int*)(base_address + o0);\ - const int v1 = *(int*)(base_address + o1);\ - const int v2 = *(int*)(base_address + o2);\ - const int v3 = *(int*)(base_address + o3);\ - _mm_insert_epi32(FN(dst.m[i]), v0, 0);\ - _mm_insert_epi32(FN(dst.m[i]), v1, 1);\ - _mm_insert_epi32(FN(dst.m[i]), v2, 2);\ - _mm_insert_epi32(FN(dst.m[i]), v3, 3);\ - }\ -}\ -INLINE void GATHER(TYPE##1 &dst, const geni1 &offset, char *base_address) {\ - const int o0 = _mm_extract_epi32(offset.m[0], 0);\ - const int v0 = *(int*)(base_address + o0);\ - _mm_insert_epi32(FN(dst.m[0]), v0, 0);\ +template +INLINE void GATHER(simd_dw &dst, + const simd_dw &offset, + char *base_address) { + for (uint32_t i = 0; i < vectorNum; ++i) { + const int o0 = _mm_extract_epi32(offset.m[i], 0); + const int o1 = _mm_extract_epi32(offset.m[i], 1); + const int o2 = _mm_extract_epi32(offset.m[i], 2); + const int o3 = _mm_extract_epi32(offset.m[i], 3); + const int v0 = *(int*)(base_address + o0); + const int v1 = *(int*)(base_address + o1); + const int v2 = *(int*)(base_address + o2); + const int v3 = *(int*)(base_address + o3); + _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v0, 0); + _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v1, 1); + 
_mm_insert_epi32(_mm_castps_si128(dst.m[i]), v2, 2); + _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v3, 3); + } } -GATHER_OP(genf, _mm_castps_si128) -GATHER_OP(geni, ID) -#undef GATHER_OP +////////////////////////////////////////////////////////////////////////////// +// Scalar instructions +////////////////////////////////////////////////////////////////////////////// +INLINE uint32_t elemNum(const scalar_dw &x) { return 1; } +INLINE uint32_t elemNum(const scalar_m &x) { return 1; } +INLINE uint32_t mask(const scalar_m &v) { return v.u ? 1 : 0; } +INLINE void ADD_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f + v1.f; } +INLINE void SUB_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f - v1.f; } +INLINE void MUL_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f * v1.f; } +INLINE void DIV_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f / v1.f; } +INLINE void EQ_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f == v1.f ? ~0 : 0); } +INLINE void NE_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f != v1.f ? ~0 : 0); } +INLINE void LE_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f <= v1.f ? ~0 : 0); } +INLINE void LT_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f < v1.f ? ~0 : 0); } +INLINE void GE_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f >= v1.f ? ~0 : 0); } +INLINE void GT_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f > v1.f ? ~0 : 0); } +INLINE void ADD_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s + v1.s; } +INLINE void SUB_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s - v1.s; } +INLINE void MUL_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s * v1.s; } +INLINE void DIV_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s / v1.s; } +INLINE void REM_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s % v1.s; } +INLINE void MUL_U32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.u = v0.u * v1.u; } +INLINE void DIV_U32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.u = v0.u / v1.u; } +INLINE void REM_U32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.u = v0.u % v1.u; } +INLINE void EQ_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s == v1.s ? ~0 : 0); } +INLINE void NE_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s != v1.s ? ~0 : 0); } +INLINE void LE_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s <= v1.s ? ~0 : 0); } +INLINE void LT_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s < v1.s ? ~0 : 0); } +INLINE void GE_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s >= v1.s ? ~0 : 0); } +INLINE void GT_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s > v1.s ? ~0 : 0); } +INLINE void XOR_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s ^ v1.s; } +INLINE void OR_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s | v1.s; } +INLINE void AND_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s & v1.s; } +INLINE void LE_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u <= v1.u ? ~0 : 0); } +INLINE void LT_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u < v1.u ? ~0 : 0); } +INLINE void GE_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u >= v1.u ? ~0 : 0); } +INLINE void GT_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u > v1.u ? 
~0 : 0); } +INLINE void LOAD(scalar_dw &dst, const char *ptr) { dst.u = *(const uint32_t *) ptr; } +INLINE void STORE(scalar_dw src, char *ptr) { *(uint32_t *) ptr = src.u; } +INLINE void LOADI(scalar_dw &dst, uint32_t u) { dst.u = u; } +INLINE void SCATTER(scalar_dw value, scalar_dw offset, char *base) { *(uint32_t*)(base + offset.u) = value.u; } +INLINE void GATHER(scalar_dw &dst, scalar_dw offset, char *base) { dst.u = *(uint32_t*)(base + offset.u); } + +////////////////////////////////////////////////////////////////////////////// +// Identical instructions are forwarded +////////////////////////////////////////////////////////////////////////////// + +#define ADD_U32 ADD_S32 +#define SUB_U32 SUB_S32 +#define XOR_U32 XOR_S32 +#define OR_U32 OR_S32 +#define AND_U32 AND_S32 +#define EQ_U32 EQ_S32 +#define NE_U32 NE_S32 + +#undef PS2SI +#undef SI2PS #undef ID #undef INLINE diff --git a/backend/src/backend/sim/sim_vector_str.cpp b/backend/src/backend/sim/sim_vector_str.cpp index e37d7f2..87ff6de 100644 --- a/backend/src/backend/sim/sim_vector_str.cpp +++ b/backend/src/backend/sim/sim_vector_str.cpp @@ -61,309 +61,373 @@ std::string sim_vector_str = "#include \n" "\n" "#define INLINE inline __attribute__((always_inline))\n" +"#define ID(X) (X)\n" +"#define PS2SI(X) _mm_castps_si128(X)\n" +"#define SI2PS(X) _mm_castsi128_ps(X)\n" "\n" -"/*! Base structure for 1 / 4 / 8 / 16 / 32 floats */\n" -"template \n" -"struct genf { __m128 m[vectorNum]; };\n" -"/*! Base structure for 1 / 4 / 8 / 16 / 32 integers */\n" -"template \n" -"struct geni { __m128i m[vectorNum]; };\n" -"/*! Base structure for 1 / 4 / 8 / 16 / 32 booleans (m stands for \"mask\") */\n" -"template \n" -"struct genm { __m128 m[vectorNum]; };\n" -"\n" -"/*! To cast through memory */\n" -"union CastType {\n" -" INLINE CastType(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {\n" -" u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3;\n" -" }\n" -" INLINE CastType(float f0, float f1, float f2, float f3) {\n" -" f[0] = f0; f[1] = f1; f[2] = f2; f[3] = f3;\n" -" }\n" -" __m128 v;\n" -" __m128i vi;\n" -" uint32_t u[4];\n" -" float f[4];\n" -"};\n" -"\n" -"typedef genf<1,true> genf1; // contains 3 clobbered values\n" -"typedef genf<1,false> genf4;\n" -"typedef genf<2,false> genf8;\n" -"typedef genf<4,false> genf16;\n" -"typedef genf<8,false> genf32;\n" -"typedef geni<1,true> geni1; // contains 3 clobbered values\n" -"typedef geni<1,false> geni4;\n" -"typedef geni<2,false> geni8;\n" -"typedef geni<4,false> geni16;\n" -"typedef geni<8,false> geni32;\n" -"typedef genm<1,true> genm1; // contains 3 clobbered values\n" -"typedef genm<1,false> genm4;\n" -"typedef genm<2,false> genm8;\n" -"typedef genm<4,false> genm16;\n" -"typedef genm<8,false> genm32;\n" -"\n" -"static INLINE uint32_t elemNum(genf1 x) { return 1; }\n" -"static INLINE uint32_t elemNum(genf4 x) { return 4; }\n" -"static INLINE uint32_t elemNum(genf8 x) { return 8; }\n" -"static INLINE uint32_t elemNum(genf16 x) { return 16; }\n" -"static INLINE uint32_t elemNum(genf32 x) { return 32; }\n" -"static INLINE uint32_t elemNum(geni1 x) { return 1; }\n" -"static INLINE uint32_t elemNum(geni4 x) { return 4; }\n" -"static INLINE uint32_t elemNum(geni8 x) { return 8; }\n" -"static INLINE uint32_t elemNum(geni16 x) { return 16; }\n" -"static INLINE uint32_t elemNum(geni32 x) { return 32; }\n" -"\n" +"/* Some extra SSE functions */\n" "template\n" "INLINE const __m128 shuffle(const __m128& b) {\n" " return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));\n" 
"}\n" -"\n" "template INLINE\n" "__m128 expand(const __m128& b) { \n" " return shuffle(b);\n" "}\n" -"\n" "template\n" "INLINE const __m128i shuffle(const __m128i& a) {\n" " return _mm_shuffle_epi32(a, _MM_SHUFFLE(index_3, index_2, index_1, index_0));\n" "}\n" -"\n" "template\n" "INLINE const __m128i expand(const __m128i& b) {\n" " return shuffle(b);\n" "}\n" "\n" +"/*! Base structure for scalar double word */\n" +"union scalar_dw { uint32_t u; int32_t s; float f; };\n" +"\n" +"/*! Base structure for scalar mask */\n" +"union scalar_m { uint32_t u; int32_t s; float f; };\n" +"\n" +"/*! Base structure for vectors 4 / 8 / 16 / 32 double words */\n" +"template \n" +"struct simd_dw {\n" +" INLINE simd_dw(void) {}\n" +" INLINE simd_dw(const scalar_dw &s) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&s.f);\n" +" }\n" +" simd_dw &operator= (const scalar_dw &s) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&s.f);\n" +" return *this;\n" +" }\n" +" __m128 m[vectorNum];\n" +"};\n" +"\n" +"/*! Base structure for 4 / 8 / 16 / 32 booleans (m stands for \"mask\") */\n" +"template \n" +"struct simd_m {\n" +" INLINE simd_m(void) {}\n" +" INLINE simd_m(scalar_m s) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i) m[i] = _mm_load1_ps(&s.f);\n" +" }\n" +" __m128 m[vectorNum];\n" +"};\n" +"\n" +"/*! To cast through memory */\n" +"union cast_dw {\n" +" INLINE cast_dw(uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {\n" +" u[0] = u0; u[1] = u1; u[2] = u2; u[3] = u3;\n" +" }\n" +" INLINE cast_dw(int32_t s0, int32_t s1, int32_t s2, int32_t s3) {\n" +" s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;\n" +" }\n" +" INLINE cast_dw(float f0, float f1, float f2, float f3) {\n" +" f[0] = f0; f[1] = f1; f[2] = f2; f[3] = f3;\n" +" }\n" +" INLINE cast_dw(const __m128 &v) : v(v) {}\n" +" INLINE cast_dw(const __m128i &vi) : vi(vi) {}\n" +" INLINE cast_dw(void) {}\n" +" __m128 v;\n" +" __m128i vi;\n" +" uint32_t u[4];\n" +" int32_t s[4];\n" +" float f[4];\n" +"};\n" +"static const cast_dw alltrue(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\n" +"\n" +"/* Some convenient typedefs */\n" +"typedef scalar_dw simd1dw;\n" +"typedef simd_dw<1> simd4dw;\n" +"typedef simd_dw<2> simd8dw;\n" +"typedef simd_dw<4> simd16dw;\n" +"typedef simd_dw<8> simd32dw;\n" +"typedef scalar_m simd1m;\n" +"typedef simd_m<1> simd4m;\n" +"typedef simd_m<2> simd8m;\n" +"typedef simd_m<4> simd16m;\n" +"typedef simd_m<8> simd32m;\n" +"\n" +"//////////////////////////////////////////////////////////////////////////////\n" +"// Vector instructions\n" +"//////////////////////////////////////////////////////////////////////////////\n" +"/* Simple function to get the number of element per vector */\n" +"template \n" +"INLINE uint32_t elemNum(const simd_dw &x) {\n" +" return 4 * vectorNum;\n" +"}\n" +"template \n" +"INLINE uint32_t elemNum(const simd_m &x) {\n" +" return 4 * vectorNum;\n" +"}\n" +"\n" "/* Build an integer mask from the mask vectors */\n" "template \n" -"INLINE uint32_t mask(const genm v) {\n" +"INLINE uint32_t mask(const simd_m v) {\n" " uint32_t m = _mm_movemask_ps(v.m[0]);\n" " for (uint32_t i = 1; i < vectorNum; ++i)\n" -" m |= _mm_movemask_ps(v.m[i]) << (4*i);\n" +" m |= (_mm_movemask_ps(v.m[i]) << (4*i));\n" " return m;\n" "}\n" -"INLINE uint32_t mask(const genm1 &v) { return _mm_movemask_ps(v.m[0]) & 1; }\n" "\n" -"#define ID(X) X\n" +"/* Vector instructions that use sse* */\n" "#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\n" "template \\n" -"INLINE void 
NAME(DST_TYPE &dst,\\n" -" const SRC_TYPE &v0,\\n" -" const SRC_TYPE &v1) {\\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\\n" " for (uint32_t i = 0; i < vectorNum; ++i)\\n" " dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(v1.m[i])));\\n" "}\\n" "template \\n" -"INLINE void NAME(DST_TYPE &dst,\\n" -" const SRC_TYPE &v0,\\n" -" const SRC_TYPE##1 &v1) {\\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\\n" -" dst.m[i] = FN(INTRINSIC_NAME(FN0(v0.m[i]), FN1(expand<0>(v1.m[0]))));\\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\\n" +" NAME(dst, v0, simd_dw(v1));\\n" "}\\n" "template \\n" -"INLINE void NAME(DST_TYPE &dst,\\n" -" const SRC_TYPE##1 &v0,\\n" -" const SRC_TYPE &v1) {\\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\\n" -" dst.m[i] = FN(INTRINSIC_NAME(FN0(expand<0>(v0.m[0])), FN1(v1.m[i])));\\n" +"INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\\n" +" NAME(dst, simd_dw(v0), v1);\\n" "}\n" -"\n" -"VEC_OP(genf, genf, ADD, _mm_add_ps, ID, ID, ID);\n" -"VEC_OP(genf, genf, SUB, _mm_sub_ps, ID, ID, ID);\n" -"VEC_OP(genf, genf, MUL, _mm_mul_ps, ID, ID, ID);\n" -"VEC_OP(genf, genf, DIV, _mm_div_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, EQ, _mm_cmpeq_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, NE, _mm_cmpneq_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, LT, _mm_cmplt_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, LE, _mm_cmple_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, GT, _mm_cmpgt_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, GE, _mm_cmpge_ps, ID, ID, ID);\n" -"VEC_OP(geni, geni, ADD, _mm_add_epi32, ID, ID, ID);\n" -"VEC_OP(geni, geni, SUB, _mm_sub_epi32, ID, ID, ID);\n" -"VEC_OP(genm, geni, EQ, _mm_cmpeq_epi32, ID, ID, ID);\n" -"VEC_OP(genm, geni, SLT, _mm_cmplt_epi32, ID, ID, ID);\n" -"VEC_OP(genm, geni, SGT, _mm_cmpgt_epi32, ID, ID, ID);\n" -"VEC_OP(geni, geni, OR, _mm_or_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);\n" -"VEC_OP(geni, geni, XOR, _mm_xor_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);\n" -"VEC_OP(geni, geni, AND, _mm_and_ps, _mm_castps_si128, _mm_castsi128_ps, _mm_castsi128_ps);\n" -"VEC_OP(genm, genf, SLT, _mm_cmplt_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, SLE, _mm_cmple_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, SGT, _mm_cmpgt_ps, ID, ID, ID);\n" -"VEC_OP(genm, genf, SGE, _mm_cmpge_ps, ID, ID, ID);\n" -"\n" +"VEC_OP(simd_dw, simd_dw, ADD_F, _mm_add_ps, ID, ID, ID);\n" +"VEC_OP(simd_dw, simd_dw, SUB_F, _mm_sub_ps, ID, ID, ID);\n" +"VEC_OP(simd_dw, simd_dw, MUL_F, _mm_mul_ps, ID, ID, ID);\n" +"VEC_OP(simd_dw, simd_dw, DIV_F, _mm_div_ps, ID, ID, ID);\n" +"VEC_OP(simd_m, simd_dw, EQ_F, _mm_cmpeq_ps, ID, ID, ID);\n" +"VEC_OP(simd_m, simd_dw, NE_F, _mm_cmpneq_ps, ID, ID, ID);\n" +"VEC_OP(simd_m, simd_dw, LT_F, _mm_cmplt_ps, ID, ID, ID);\n" +"VEC_OP(simd_m, simd_dw, LE_F, _mm_cmple_ps, ID, ID, ID);\n" +"VEC_OP(simd_m, simd_dw, GT_F, _mm_cmpgt_ps, ID, ID, ID);\n" +"VEC_OP(simd_m, simd_dw, GE_F, _mm_cmpge_ps, ID, ID, ID);\n" +"VEC_OP(simd_dw, simd_dw, ADD_S32, _mm_add_epi32, SI2PS, PS2SI, PS2SI);\n" +"VEC_OP(simd_dw, simd_dw, SUB_S32, _mm_sub_epi32, SI2PS, PS2SI, PS2SI);\n" +"VEC_OP(simd_m, simd_dw, EQ_S32, _mm_cmpeq_epi32, SI2PS, PS2SI, PS2SI);\n" +"VEC_OP(simd_m, simd_dw, LT_S32, _mm_cmplt_epi32, SI2PS, PS2SI, PS2SI);\n" +"VEC_OP(simd_m, simd_dw, GT_S32, _mm_cmpgt_epi32, SI2PS, PS2SI, PS2SI);\n" +"VEC_OP(simd_dw, simd_dw, OR_S32, _mm_or_ps, ID, ID, ID);\n" +"VEC_OP(simd_dw, simd_dw, XOR_S32, _mm_xor_ps, ID, ID, ID);\n" +"VEC_OP(simd_dw, simd_dw, AND_S32, _mm_and_ps, ID, ID, ID);\n" 
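+"/* A minimal usage sketch (illustration only; x, y, z and s are assumed\n"
+"   locals, not part of the generated IR mapping):\n"
+"     simd16dw x, y, z;\n"
+"     scalar_dw s; s.f = 1.f;\n"
+"     LOADI(x, 2.f);        // broadcast 2.f into the 16 lanes of x\n"
+"     LOADI(y, 3.f);        // broadcast 3.f into the 16 lanes of y\n"
+"     ADD_F(z, x, y);       // every lane of z now holds 5.f\n"
+"     ADD_F(z, z, s);       // a scalar_dw operand is broadcast first\n"
+"     AND_S32(z, x, y);     // same untyped registers reused as integer bits\n"
+"   All functions generated above follow this dst / src0 / src1 convention. */\n"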
"#undef VEC_OP\n" "\n" -"#define ICMP_VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\n" +"/* Vector integer operations that we can get by switching argument order */\n" +"#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, INTRINSIC_NAME, FN, FN0, FN1)\\n" "template \\n" -"INLINE void NAME(DST_TYPE &dst,\\n" -" const SRC_TYPE &v0,\\n" -" const SRC_TYPE &v1) {\\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\\n" " for (uint32_t i = 0; i < vectorNum; ++i)\\n" -" dst.m[i] = FN(INTRINSIC_NAME(FN1(v1.m[i]), FN0(v0.m[i])));\\n" +" dst.m[i] = _mm_xor_ps(FN(INTRINSIC_NAME(FN1(v0.m[i]), FN0(v1.m[i]))), alltrue.v);\\n" "}\\n" "template \\n" -"INLINE void NAME(DST_TYPE &dst,\\n" -" const SRC_TYPE &v0,\\n" -" const SRC_TYPE##1 &v1) {\\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\\n" -" dst.m[i] = FN(INTRINSIC_NAME(FN1(expand<0>(v1.m[0])), FN0(v0.m[i])));\\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\\n" +" NAME(dst, v0, simd_dw(v1));\\n" "}\\n" "template \\n" -"INLINE void NAME(DST_TYPE &dst,\\n" -" const SRC_TYPE##1 &v0,\\n" -" const SRC_TYPE &v1) {\\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\\n" -" dst.m[i] = FN(INTRINSIC_NAME(FN1(v1.m[i]), FN0(expand<0>(v0.m[0]))));\\n" +"INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\\n" +" NAME(dst, simd_dw(v0), v1);\\n" +"}\n" +"VEC_OP(simd_m, simd_dw, GE_S32, _mm_cmplt_epi32, SI2PS, PS2SI, PS2SI);\n" +"VEC_OP(simd_m, simd_dw, LE_S32, _mm_cmpgt_epi32, SI2PS, PS2SI, PS2SI);\n" +"#undef VEC_OP\n" +"\n" +"/* Vector binary integer operations that require C */\n" +"#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, OP, FIELD)\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\\n" +" cast_dw c0(v0.m[i]), c1(v1.m[i]), d;\\n" +" for (uint32_t j = 0; j < 4; ++j)\\n" +" d.FIELD[j] = c0.FIELD[j] OP c1.FIELD[j];\\n" +" dst.m[i] = d.v;\\n" +" }\\n" +"}\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\\n" +" NAME(dst, v0, simd_dw(v1));\\n" +"}\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\\n" +" NAME(dst, simd_dw(v0), v1);\\n" "}\n" -"ICMP_VEC_OP(genm, geni, SGE, _mm_cmplt_epi32, ID, ID, ID);\n" -"ICMP_VEC_OP(genm, geni, SLE, _mm_cmpgt_epi32, ID, ID, ID);\n" -"#undef ICMP_VEC_OP\n" +"VEC_OP(simd_dw, simd_dw, MUL_S32, *, s);\n" +"VEC_OP(simd_dw, simd_dw, DIV_S32, /, s);\n" +"VEC_OP(simd_dw, simd_dw, REM_S32, %, s);\n" +"VEC_OP(simd_dw, simd_dw, MUL_U32, *, u);\n" +"VEC_OP(simd_dw, simd_dw, DIV_U32, /, u);\n" +"VEC_OP(simd_dw, simd_dw, REM_U32, %, u);\n" +"#undef VEC_OP\n" "\n" -"static const CastType alltrue(0xffffffff,0xffffffff,0xffffffff,0xffffffff);\n" +"/* Vector compare vectors that require C */\n" +"#define VEC_OP(DST_TYPE, SRC_TYPE, NAME, OP, FIELD)\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const SRC_TYPE &v1) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\\n" +" cast_dw c0(v0.m[i]), c1(v1.m[i]), d;\\n" +" for (uint32_t j = 0; j < 4; ++j)\\n" +" d.u[j] = (c0.FIELD[j] OP c1.FIELD[j]) ? ~0u : 0u;\\n" +" dst.m[i] = d.v;\\n" +" }\\n" +"}\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst, const SRC_TYPE &v0, const scalar_dw &v1) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\\n" +" cast_dw c0(v0.m[i]), d;\\n" +" for (uint32_t j = 0; j < 4; ++j)\\n" +" d.u[j] = (c0.FIELD[j] OP v1.FIELD) ? 
~0u : 0u;\\n" +" dst.m[i] = d.v;\\n" +" }\\n" +"}\\n" +"template \\n" +"INLINE void NAME(DST_TYPE &dst, const scalar_dw &v0, const SRC_TYPE &v1) {\\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\\n" +" cast_dw c1(v1.m[i]), d;\\n" +" for (uint32_t j = 0; j < 4; ++j)\\n" +" d.u[j] = (v0.FIELD OP c1.FIELD[j]) ? ~0u : 0u;\\n" +" dst.m[i] = d.v;\\n" +" }\\n" +"}\n" +"VEC_OP(simd_m, simd_dw, LE_U32, <=, u);\n" +"VEC_OP(simd_m, simd_dw, LT_U32, <, u);\n" +"VEC_OP(simd_m, simd_dw, GE_U32, >=, u);\n" +"VEC_OP(simd_m, simd_dw, GT_U32, >, u);\n" +"#undef VEC_OP\n" "\n" "template \n" -"INLINE void NE(genm &dst, const geni &v0, const geni &v1) {\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\n" -" dst.m[i] = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], v1.m[i])),alltrue.v));\n" +"INLINE void NE_S32(simd_m &dst,\n" +" const simd_dw &v0,\n" +" const scalar_dw &v1)\n" +"{\n" +" NE_S32(dst, v0, simd_dw(v1));\n" "}\n" "template \n" -"INLINE void NE(genm &dst, const geni &v0, const geni1 &v1) {\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\n" -" dst.m[i] = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[i], v1.m[0])),alltrue.v));\n" +"INLINE void NE_S32(simd_m &dst,\n" +" const scalar_dw &v0,\n" +" const simd_dw &v1)\n" +"{\n" +" NE_S32(dst, simd_dw(v0), v1);\n" "}\n" "template \n" -"INLINE void NE(genm &dst, const geni1 &v0, const geni &v1) {\n" +"INLINE void NE_S32(simd_m &dst,\n" +" const simd_dw &v0,\n" +" const simd_dw &v1)\n" +"{\n" " for (uint32_t i = 0; i < vectorNum; ++i)\n" -" dst.m[i] = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m[0], v1.m[i])),alltrue.v));\n" +" dst.m[i] = _mm_xor_ps(alltrue.v, SI2PS(_mm_cmpeq_epi32(PS2SI(v0.m[i]), PS2SI(v1.m[i]))));\n" "}\n" "\n" -"#define SCALAR_OP(TYPE, NAME, INTRINSIC_NAME)\\n" -"INLINE void NAME(TYPE &dst, const TYPE &v0, const TYPE &v1) {\\n" -" dst.m[0] = INTRINSIC_NAME(v0.m[0], v1.m[0]);\\n" -"}\n" -"SCALAR_OP(genf1, ADD, _mm_add_ss);\n" -"SCALAR_OP(genf1, SUB, _mm_sub_ss);\n" -"SCALAR_OP(genf1, MUL, _mm_mul_ss);\n" -"SCALAR_OP(genf1, DIV, _mm_div_ss);\n" -"SCALAR_OP(geni1, ADD, _mm_add_epi32);\n" -"SCALAR_OP(geni1, SUB, _mm_sub_epi32);\n" -"#undef SCALAR_OP\n" -"\n" -"/* load from contiguous floats / integers */\n" +"/* Load from contiguous double words */\n" "template \n" -"INLINE void LOAD(genf &dst, const char *ptr) {\n" +"INLINE void LOAD(simd_dw &dst, const char *ptr) {\n" " for (uint32_t i = 0; i < vectorNum; ++i)\n" " dst.m[i] = _mm_loadu_ps((const float*) ptr + 4*i);\n" "}\n" -"INLINE void LOAD(genf1 &dst, const char *ptr) {\n" -" dst.m[0] = _mm_load_ss((const float*) ptr);\n" -"}\n" -"template \n" -"INLINE void LOAD(geni &dst, const char *ptr) {\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\n" -" dst.m[i] = _mm_castps_si128(_mm_loadu_ps((const float*) ptr + 4*i));\n" -"}\n" -"INLINE void LOAD(geni1 &dst, const char *ptr) {\n" -" dst.m[0] = _mm_castps_si128(_mm_load_ss((const float*) ptr));\n" -"}\n" "\n" -"/* store to contiguous floats / integers */\n" +"/* Store to contiguous double words */\n" "template \n" -"INLINE void STORE(const genf &src, char *ptr) {\n" +"INLINE void STORE(const simd_dw &src, char *ptr) {\n" " for (uint32_t i = 0; i < vectorNum; ++i)\n" " _mm_storeu_ps((float*) ptr + 4*i, src.m[i]);\n" "}\n" -"INLINE void STORE(genf1 &src, char *ptr) {\n" -" _mm_store_ss((float*) ptr, src.m[0]);\n" -"}\n" -"template \n" -"INLINE void STORE(const geni &src, char *ptr) {\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\n" -" _mm_storeu_ps((float*) ptr + 4*i, 
_mm_castsi128_ps(src.m[i]));\n" -"}\n" -"INLINE void STORE(const geni1 &src, char *ptr) {\n" -" _mm_store_ss((float*) ptr, _mm_castsi128_ps(src.m[0]));\n" -"}\n" "\n" "/* Load immediates */\n" "template \n" -"INLINE void LOADI(genf &dst, float f) {\n" +"INLINE void LOADI(simd_dw &dst, float f) {\n" " for (uint32_t i = 0; i < vectorNum; ++i)\n" " dst.m[i] = _mm_load1_ps(&f);\n" "}\n" -"INLINE void LOADI(genf1 &dst, float f) { dst.m[0] = _mm_load_ss(&f); }\n" -"template \n" -"INLINE void LOADI(geni &dst, uint32_t u) {\n" -" union { float f; uint32_t u; } cast;\n" -" cast.u = u;\n" -" for (uint32_t i = 0; i < vectorNum; ++i)\n" -" dst.m[i] = _mm_castps_si128(_mm_load1_ps(&cast.f));\n" -"}\n" -"INLINE void LOADI(geni1 &dst, uint32_t u) {\n" -" union { float f; uint32_t u; } cast;\n" -" cast.u = u;\n" -" dst.m[0] = _mm_castps_si128(_mm_load_ss(&cast.f));\n" -"}\n" "\n" "/* Scatter */\n" -"#define SCATTER_OP(TYPE, FN)\\n" -"template \\n" -"INLINE void SCATTER(const TYPE &value,\\n" -" const geni &offset,\\n" -" char *base_address) {\\n" -" for (uint32_t i = 0; i < vectorNum; ++i) {\\n" -" const int v0 = _mm_extract_epi32(FN(value.m[i]), 0);\\n" -" const int v1 = _mm_extract_epi32(FN(value.m[i]), 1);\\n" -" const int v2 = _mm_extract_epi32(FN(value.m[i]), 2);\\n" -" const int v3 = _mm_extract_epi32(FN(value.m[i]), 3);\\n" -" const int o0 = _mm_extract_epi32(offset.m[i], 0);\\n" -" const int o1 = _mm_extract_epi32(offset.m[i], 1);\\n" -" const int o2 = _mm_extract_epi32(offset.m[i], 2);\\n" -" const int o3 = _mm_extract_epi32(offset.m[i], 3);\\n" -" *(int*)(base_address + o0) = v0;\\n" -" *(int*)(base_address + o1) = v1;\\n" -" *(int*)(base_address + o2) = v2;\\n" -" *(int*)(base_address + o3) = v3;\\n" -" }\\n" -"}\\n" -"INLINE void SCATTER(const TYPE##1 &value, const geni1 &offset, char *base_address) {\\n" -" const int v0 = _mm_extract_epi32(FN(value.m[0]), 0);\\n" -" const int o0 = _mm_extract_epi32(offset.m[0], 0);\\n" -" *(int*)(base_address + o0) = v0;\\n" +"template \n" +"INLINE void SCATTER(const simd_dw &value,\n" +" const simd_dw &offset,\n" +" char *base_address) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\n" +" const int v0 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 0);\n" +" const int v1 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 1);\n" +" const int v2 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 2);\n" +" const int v3 = _mm_extract_epi32(_mm_castps_si128(value.m[i]), 3);\n" +" const int o0 = _mm_extract_epi32(offset.m[i], 0);\n" +" const int o1 = _mm_extract_epi32(offset.m[i], 1);\n" +" const int o2 = _mm_extract_epi32(offset.m[i], 2);\n" +" const int o3 = _mm_extract_epi32(offset.m[i], 3);\n" +" *(int*)(base_address + o0) = v0;\n" +" *(int*)(base_address + o1) = v1;\n" +" *(int*)(base_address + o2) = v2;\n" +" *(int*)(base_address + o3) = v3;\n" +" }\n" "}\n" -"SCATTER_OP(genf, _mm_castps_si128)\n" -"SCATTER_OP(geni, ID)\n" -"#undef SCATTER_OP\n" "\n" "/* Gather */\n" -"#define GATHER_OP(TYPE, FN)\\n" -"template \\n" -"INLINE void GATHER(TYPE &dst,\\n" -" const geni &offset,\\n" -" char *base_address) {\\n" -" for (uint32_t i = 0; i < vectorNum; ++i) {\\n" -" const int o0 = _mm_extract_epi32(offset.m[i], 0);\\n" -" const int o1 = _mm_extract_epi32(offset.m[i], 1);\\n" -" const int o2 = _mm_extract_epi32(offset.m[i], 2);\\n" -" const int o3 = _mm_extract_epi32(offset.m[i], 3);\\n" -" const int v0 = *(int*)(base_address + o0);\\n" -" const int v1 = *(int*)(base_address + o1);\\n" -" const int v2 = *(int*)(base_address + o2);\\n" -" const int v3 = 
*(int*)(base_address + o3);\\n" -" _mm_insert_epi32(FN(dst.m[i]), v0, 0);\\n" -" _mm_insert_epi32(FN(dst.m[i]), v1, 1);\\n" -" _mm_insert_epi32(FN(dst.m[i]), v2, 2);\\n" -" _mm_insert_epi32(FN(dst.m[i]), v3, 3);\\n" -" }\\n" -"}\\n" -"INLINE void GATHER(TYPE##1 &dst, const geni1 &offset, char *base_address) {\\n" -" const int o0 = _mm_extract_epi32(offset.m[0], 0);\\n" -" const int v0 = *(int*)(base_address + o0);\\n" -" _mm_insert_epi32(FN(dst.m[0]), v0, 0);\\n" +"template \n" +"INLINE void GATHER(simd_dw &dst,\n" +" const simd_dw &offset,\n" +" char *base_address) {\n" +" for (uint32_t i = 0; i < vectorNum; ++i) {\n" +" const int o0 = _mm_extract_epi32(offset.m[i], 0);\n" +" const int o1 = _mm_extract_epi32(offset.m[i], 1);\n" +" const int o2 = _mm_extract_epi32(offset.m[i], 2);\n" +" const int o3 = _mm_extract_epi32(offset.m[i], 3);\n" +" const int v0 = *(int*)(base_address + o0);\n" +" const int v1 = *(int*)(base_address + o1);\n" +" const int v2 = *(int*)(base_address + o2);\n" +" const int v3 = *(int*)(base_address + o3);\n" +" _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v0, 0);\n" +" _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v1, 1);\n" +" _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v2, 2);\n" +" _mm_insert_epi32(_mm_castps_si128(dst.m[i]), v3, 3);\n" +" }\n" "}\n" -"GATHER_OP(genf, _mm_castps_si128)\n" -"GATHER_OP(geni, ID)\n" -"#undef GATHER_OP\n" "\n" +"//////////////////////////////////////////////////////////////////////////////\n" +"// Scalar instructions\n" +"//////////////////////////////////////////////////////////////////////////////\n" +"INLINE uint32_t elemNum(const scalar_dw &x) { return 1; }\n" +"INLINE uint32_t elemNum(const scalar_m &x) { return 1; }\n" +"INLINE uint32_t mask(const scalar_m &v) { return v.u ? 1 : 0; }\n" +"INLINE void ADD_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f + v1.f; }\n" +"INLINE void SUB_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f - v1.f; }\n" +"INLINE void MUL_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f * v1.f; }\n" +"INLINE void DIV_F(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.f = v0.f / v1.f; }\n" +"INLINE void EQ_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f == v1.f ? ~0 : 0); }\n" +"INLINE void NE_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f != v1.f ? ~0 : 0); }\n" +"INLINE void LE_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f <= v1.f ? ~0 : 0); }\n" +"INLINE void LT_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f < v1.f ? ~0 : 0); }\n" +"INLINE void GE_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f >= v1.f ? ~0 : 0); }\n" +"INLINE void GT_F(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.f > v1.f ? 
~0 : 0); }\n" +"INLINE void ADD_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s + v1.s; }\n" +"INLINE void SUB_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s - v1.s; }\n" +"INLINE void MUL_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s * v1.s; }\n" +"INLINE void DIV_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s / v1.s; }\n" +"INLINE void REM_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s % v1.s; }\n" +"INLINE void MUL_U32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.u = v0.u * v1.u; }\n" +"INLINE void DIV_U32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.u = v0.u / v1.u; }\n" +"INLINE void REM_U32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.u = v0.u % v1.u; }\n" +"INLINE void EQ_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s == v1.s ? ~0 : 0); }\n" +"INLINE void NE_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s != v1.s ? ~0 : 0); }\n" +"INLINE void LE_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s <= v1.s ? ~0 : 0); }\n" +"INLINE void LT_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s < v1.s ? ~0 : 0); }\n" +"INLINE void GE_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s >= v1.s ? ~0 : 0); }\n" +"INLINE void GT_S32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.s > v1.s ? ~0 : 0); }\n" +"INLINE void XOR_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s ^ v1.s; }\n" +"INLINE void OR_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s | v1.s; }\n" +"INLINE void AND_S32(scalar_dw &dst, scalar_dw v0, scalar_dw v1) { dst.s = v0.s & v1.s; }\n" +"INLINE void LE_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u <= v1.u ? ~0 : 0); }\n" +"INLINE void LT_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u < v1.u ? ~0 : 0); }\n" +"INLINE void GE_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u >= v1.u ? ~0 : 0); }\n" +"INLINE void GT_U32(scalar_m &dst, scalar_dw v0, scalar_dw v1) { dst.u = (v0.u > v1.u ? 
~0 : 0); }\n" +"INLINE void LOAD(scalar_dw &dst, const char *ptr) { dst.u = *(const uint32_t *) ptr; }\n" +"INLINE void STORE(scalar_dw src, char *ptr) { *(uint32_t *) ptr = src.u; }\n" +"INLINE void LOADI(scalar_dw &dst, uint32_t u) { dst.u = u; }\n" +"INLINE void SCATTER(scalar_dw value, scalar_dw offset, char *base) { *(uint32_t*)(base + offset.u) = value.u; }\n" +"INLINE void GATHER(scalar_dw &dst, scalar_dw offset, char *base) { dst.u = *(uint32_t*)(base + offset.u); }\n" +"\n" +"//////////////////////////////////////////////////////////////////////////////\n" +"// Identical instructions are forwarded\n" +"//////////////////////////////////////////////////////////////////////////////\n" +"\n" +"#define ADD_U32 ADD_S32\n" +"#define SUB_U32 SUB_S32\n" +"#define XOR_U32 XOR_S32\n" +"#define OR_U32 OR_S32\n" +"#define AND_U32 AND_S32\n" +"#define EQ_U32 EQ_S32\n" +"#define NE_U32 NE_S32\n" +"\n" +"#undef PS2SI\n" +"#undef SI2PS\n" "#undef ID\n" "#undef INLINE\n" "\n" diff --git a/backend/src/utest/utest_vector.cpp b/backend/src/utest/utest_vector.cpp index 78bfcc0..e6db38b 100644 --- a/backend/src/utest/utest_vector.cpp +++ b/backend/src/utest/utest_vector.cpp @@ -20,10 +20,12 @@ #include "backend/sim/sim_vector.h" #include "utest/utest.hpp" + static INLINE bool ok(float x, float y) { return fabs(x-y) / (1.f + std::max(fabs(x), fabs(y))) < 1.e-6; } -static INLINE bool ok(int x, int y) { return x == y; } +static INLINE bool ok(int32_t x, int32_t y) { return x == y; } +static INLINE bool ok(uint32_t x, uint32_t y) { return x == y; } #define CHECK_BINARY_OP(TYPE,FN,OP,DST,SRC0,SRC1,ELEM0,ELEM1)\ do {\ @@ -38,60 +40,304 @@ static INLINE bool ok(int x, int y) { return x == y; } static void utestFP(void) { - genf1 _0, _4, _5; - genf16 _1, _2, _3; - const float data[32] = {1.f,1.f,2.f, 3.f, 4.f, 5.f, 6.f, 7.f, + simd1dw _0, _4, _5; + simd16dw _1, _2, _3; + const float data[32] = {1.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f, 8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f, 8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f, - 1.f,1.f,2.f, 3.f, 4.f, 5.f, 6.f, 7.f}; - - LOAD(_0, (const char *) (data+4)); - LOAD(_4, (const char *) (data+5)); - LOAD(_1, (const char *) (data)); - LOAD(_2, (const char *) (data)); - CHECK_BINARY_OP(float,MUL,*,_3,_2,_1,data[i],data[i]); - CHECK_BINARY_OP(float,DIV,/,_3,_2,_1,data[i],data[i]); - CHECK_BINARY_OP(float,ADD,+,_3,_2,_1,data[i],data[i]); - CHECK_BINARY_OP(float,SUB,-,_3,_2,_1,data[i],data[i]); - CHECK_BINARY_OP(float,MUL,*,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,DIV,/,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,ADD,+,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,SUB,-,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,MUL,*,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,DIV,/,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,ADD,+,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,SUB,-,_3,_2,_0,data[i],data[4]); - CHECK_BINARY_OP(float,MUL,*,_5,_4,_0,data[5],data[4]); - CHECK_BINARY_OP(float,DIV,/,_5,_4,_0,data[5],data[4]); - CHECK_BINARY_OP(float,ADD,+,_5,_4,_0,data[5],data[4]); - CHECK_BINARY_OP(float,SUB,-,_5,_4,_0,data[5],data[4]); + 1.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f}; + for (uint32_t i = 0; i < 32; ++i) { + const int index0 = rand() % 32; + const int index1 = rand() % 16; + const int index2 = rand() % 16; + const int index4 = rand() % 32; + LOAD(_0, (const char *) (data+index0)); + LOAD(_1, (const char *) (data+index1)); + LOAD(_2, (const char *) (data+index2)); + LOAD(_4, (const char *) (data+index4)); + 
+    CHECK_BINARY_OP(float,MUL_F,*,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(float,DIV_F,/,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(float,ADD_F,+,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(float,SUB_F,-,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(float,MUL_F,*,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,DIV_F,/,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,ADD_F,+,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,SUB_F,-,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,MUL_F,*,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,DIV_F,/,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,ADD_F,+,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,SUB_F,-,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(float,MUL_F,*,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(float,DIV_F,/,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(float,ADD_F,+,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(float,SUB_F,-,_5,_4,_0,data[index4],data[index0]);
+  }
+}
+
+static void utestINT32(void)
+{
+  simd1dw _0, _4, _5;
+  simd16dw _1, _2, _3;
+  const int32_t data[32] = {-1,1,-2,-3,4,-5,6,7,-8,9,10,11,12,13,14,15,8,
+                            9,10,11,12,-13,14,-15,-1,1,-2,3,4,5,6,7};
+  for (uint32_t i = 0; i < 32; ++i) {
+    const int index0 = rand() % 32;
+    const int index1 = rand() % 16;
+    const int index2 = rand() % 16;
+    const int index4 = rand() % 32;
+    LOAD(_0, (const char *) (data+index0));
+    LOAD(_1, (const char *) (data+index1));
+    LOAD(_2, (const char *) (data+index2));
+    LOAD(_4, (const char *) (data+index4));
+    CHECK_BINARY_OP(int32_t,ADD_S32,+,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,SUB_S32,-,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,MUL_S32,*,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,DIV_S32,/,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,REM_S32,%,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,AND_S32,&,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,XOR_S32,^,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,OR_S32, |,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(int32_t,ADD_S32,+,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,SUB_S32,-,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,MUL_S32,*,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,DIV_S32,/,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,REM_S32,%,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,AND_S32,&,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,XOR_S32,^,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,OR_S32, |,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(int32_t,ADD_S32,+,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(int32_t,SUB_S32,-,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(int32_t,MUL_S32,*,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(int32_t,DIV_S32,/,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(int32_t,REM_S32,%,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(int32_t,AND_S32,&,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(int32_t,XOR_S32,^,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(int32_t,OR_S32, |,_5,_4,_0,data[index4],data[index0]);
+  }
+}
+
+static void utestUINT32(void)
+{
+  simd1dw _0, _4, _5;
+  simd16dw _1, _2, _3;
+  const uint32_t data[32] = {1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,8,
+                             9,10,11,12,13,14,15,1,1,2,3,4,5,6,7};
+  for (uint32_t i = 0; i < 32; ++i) {
+    const int index0 = rand() % 32;
+    const int index1 = rand() % 16;
+    const int index2 = rand() % 16;
+    const int index4 = rand() % 32;
+    LOAD(_0, (const char *) (data+index0));
+    LOAD(_1, (const char *) (data+index1));
+    LOAD(_2, (const char *) (data+index2));
+    LOAD(_4, (const char *) (data+index4));
+    CHECK_BINARY_OP(uint32_t,ADD_U32,+,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,SUB_U32,-,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,MUL_U32,*,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,DIV_U32,/,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,REM_U32,%,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,AND_U32,&,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,XOR_U32,^,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,OR_U32, |,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_BINARY_OP(uint32_t,ADD_U32,+,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,SUB_U32,-,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,MUL_U32,*,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,DIV_U32,/,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,REM_U32,%,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,AND_U32,&,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,XOR_U32,^,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,OR_U32, |,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_BINARY_OP(uint32_t,ADD_U32,+,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(uint32_t,SUB_U32,-,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(uint32_t,MUL_U32,*,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(uint32_t,DIV_U32,/,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(uint32_t,REM_U32,%,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(uint32_t,AND_U32,&,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(uint32_t,XOR_U32,^,_5,_4,_0,data[index4],data[index0]);
+    CHECK_BINARY_OP(uint32_t,OR_U32, |,_5,_4,_0,data[index4],data[index0]);
+  }
+}
+#undef CHECK_BINARY_OP
+
+#define CHECK_CMP_OP(FN,OP,DST,SRC0,SRC1,ELEM0,ELEM1)\
+  do {\
+    FN(DST, SRC0, SRC1);\
+    uint32_t m = 0;\
+    for (uint32_t i = 0; i < elemNum(DST); ++i)\
+      m |= (((ELEM0 OP ELEM1) ? 1 : 0) << i);\
+    GBE_ASSERT(m == mask(DST));\
+  } while (0);
+
+static void utestUINT32Cmp(void)
+{
+  simd1dw _0, _4;
+  simd16dw _1, _2;
+  simd8dw _6, _7;
+  simd1m _5;
+  simd16m _3;
+  simd8m _8;
+  const uint32_t data[64] = {11,12,13,14,15,8,1,1,2,3,4,5,6,7,8,9,10,
+                             9,10,11,12,13,14,15,1,1,2,3,4,5,6,7,
+                             10,11,12,13,14,15,8,1,1,2,3,4,5,6,7,8,9,
+                             9,10,11,12,13,14,15,1,1,2,3,4,5,6,7};
+  for (uint32_t i = 0; i < 32; ++i) {
+    const int index0 = rand() % 32;
+    const int index1 = rand() % 16;
+    const int index2 = rand() % 16;
+    const int index4 = rand() % 32;
+    const int index6 = rand() % 16;
+    const int index7 = rand() % 32;
+    LOAD(_0, (const char *) (data+index0));
+    LOAD(_1, (const char *) (data+index1));
+    LOAD(_2, (const char *) (data+index2));
+    LOAD(_4, (const char *) (data+index4));
+    LOAD(_6, (const char *) (data+index6));
+    LOAD(_7, (const char *) (data+index7));
+    CHECK_CMP_OP(GE_U32,>=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(LE_U32,<=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(GT_U32,>,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(LT_U32,<,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(EQ_U32,==,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(NE_U32,!=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(GE_U32,>=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(LE_U32,<=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(GT_U32,>,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(LT_U32,<,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(EQ_U32,==,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(NE_U32,!=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(GE_U32,>=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(LE_U32,<=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(GT_U32,>,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(LT_U32,<,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(EQ_U32,==,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(NE_U32,!=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(GE_U32,>=,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(LE_U32,<=,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(GT_U32,>,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(LT_U32,<,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(EQ_U32,==,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(NE_U32,!=,_5,_4,_0,data[index4],data[index0]);
+  }
+}
+
+static void utestINT32Cmp(void)
+{
+  simd1dw _0, _4;
+  simd16dw _1, _2;
+  simd8dw _6, _7;
+  simd1m _5;
+  simd16m _3;
+  simd8m _8;
+  const int32_t data[64] = {-11,-12,13,14,-15,8,-1,-1,2,3,4,5,-6,7,8,9,10,
+                            9,10,-11,12,-13,14,15,1,1,2,-3,4,-5,6,7,
+                            10,11,-12,13,14,15,-8,1,1,2,-3,-4,5,-6,7,8,9,
+                            9,10,11,12,-13,14,15,-1,-1,-2,-3,-4,5,6,7};
+
+  for (uint32_t i = 0; i < 32; ++i) {
+    const int index0 = rand() % 32;
+    const int index1 = rand() % 16;
+    const int index2 = rand() % 16;
+    const int index4 = rand() % 32;
+    const int index6 = rand() % 16;
+    const int index7 = rand() % 32;
+    LOAD(_0, (const char *) (data+index0));
+    LOAD(_1, (const char *) (data+index1));
+    LOAD(_2, (const char *) (data+index2));
+    LOAD(_4, (const char *) (data+index4));
+    LOAD(_6, (const char *) (data+index6));
+    LOAD(_7, (const char *) (data+index7));
+    CHECK_CMP_OP(GE_S32,>=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(LE_S32,<=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(GT_S32,>,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(LT_S32,<,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(EQ_S32,==,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(NE_S32,!=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(GE_S32,>=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(LE_S32,<=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(GT_S32,>,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(LT_S32,<,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(EQ_S32,==,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(NE_S32,!=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(GE_S32,>=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(LE_S32,<=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(GT_S32,>,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(LT_S32,<,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(EQ_S32,==,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(NE_S32,!=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(GE_S32,>=,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(LE_S32,<=,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(GT_S32,>,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(LT_S32,<,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(EQ_S32,==,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(NE_S32,!=,_5,_4,_0,data[index4],data[index0]);
+  }
 }
 
-static void utestInt(void)
+static void utestFPCmp(void)
 {
-  geni1 _0, _4, _5;
-  geni16 _1, _2, _3;
-  const int data[32] = {1,1,2, 3, 4, 5, 6, 7,
-                        8,9,10,11,12,13,14,15,
-                        8,9,10,11,12,13,14,15,
-                        1,1,2, 3, 4, 5, 6, 7};
-  LOAD(_0, (const char *) (data+4));
-  LOAD(_4, (const char *) (data+5));
-  LOAD(_1, (const char *) (data));
-  LOAD(_2, (const char *) (data));
-  CHECK_BINARY_OP(int,ADD,+,_3,_2,_1,data[i],data[i]);
-  CHECK_BINARY_OP(int,SUB,-,_3,_2,_1,data[i],data[i]);
-  CHECK_BINARY_OP(int,ADD,+,_3,_2,_0,data[i],data[4]);
-  CHECK_BINARY_OP(int,SUB,-,_3,_2,_0,data[i],data[4]);
-  CHECK_BINARY_OP(int,ADD,+,_5,_4,_0,data[5],data[4]);
-  CHECK_BINARY_OP(int,SUB,-,_5,_4,_0,data[5],data[4]);
+  simd1dw _0, _4;
+  simd16dw _1, _2;
+  simd8dw _6, _7;
+  simd1m _5;
+  simd16m _3;
+  simd8m _8;
+  const float data[64] = {1.f,-1.f,2.f,3.f,4.f,5.f,-6.f,7.f,
+                          8.f,9.f,10.f,11.f,12.f,-13.f,14.f,15.f,
+                          -8.f,9.f,-10.f,11.f,-12.f,13.f,-14.f,15.f,
+                          1.f,1.f,2.f,3.f,4.f,5.f,6.f,-7.f,
+                          8.f,9.f,10.f,11.f,12.f,-13.f,14.f,15.f,
+                          -8.f,9.f,-10.f,11.f,-12.f,13.f,-14.f,15.f,
+                          8.f,9.f,10.f,11.f,12.f,-13.f,14.f,15.f};
+  for (uint32_t i = 0; i < 32; ++i) {
+    const int index0 = rand() % 32;
+    const int index1 = rand() % 16;
+    const int index2 = rand() % 16;
+    const int index4 = rand() % 32;
+    const int index6 = rand() % 16;
+    const int index7 = rand() % 32;
+    LOAD(_0, (const char *) (data+index0));
+    LOAD(_1, (const char *) (data+index1));
+    LOAD(_2, (const char *) (data+index2));
+    LOAD(_4, (const char *) (data+index4));
+    LOAD(_6, (const char *) (data+index6));
+    LOAD(_7, (const char *) (data+index7));
+    CHECK_CMP_OP(GE_F,>=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(LE_F,<=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(GT_F,>,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(LT_F,<,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(EQ_F,==,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(NE_F,!=,_3,_2,_1,data[i+index2],data[i+index1]);
+    CHECK_CMP_OP(GE_F,>=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(LE_F,<=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(GT_F,>,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(LT_F,<,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(EQ_F,==,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(NE_F,!=,_8,_7,_6,data[i+index7],data[i+index6]);
+    CHECK_CMP_OP(GE_F,>=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(LE_F,<=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(GT_F,>,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(LT_F,<,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(EQ_F,==,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(NE_F,!=,_3,_2,_0,data[i+index2],data[index0]);
+    CHECK_CMP_OP(GE_F,>=,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(LE_F,<=,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(GT_F,>,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(LT_F,<,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(EQ_F,==,_5,_4,_0,data[index4],data[index0]);
+    CHECK_CMP_OP(NE_F,!=,_5,_4,_0,data[index4],data[index0]);
+  }
 }
+#undef CHECK_CMP_OP
+
 static void utestVector(void)
 {
   UTEST_EXPECT_SUCCESS(utestFP());
-  UTEST_EXPECT_SUCCESS(utestInt());
+  UTEST_EXPECT_SUCCESS(utestINT32());
+  UTEST_EXPECT_SUCCESS(utestUINT32());
+  UTEST_EXPECT_SUCCESS(utestFPCmp());
+  UTEST_EXPECT_SUCCESS(utestINT32Cmp());
+  UTEST_EXPECT_SUCCESS(utestUINT32Cmp());
 }
 
 UTEST_REGISTER(utestVector)
-- 
2.7.4
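For reference, a single comparison check such as CHECK_CMP_OP(GE_U32,>=,_3,_2,_1,data[i+index2],data[i+index1]) in the tests above expands to roughly the sketch below once the macro is substituted. This is only an illustrative reading of the test, assuming the LOAD, GE_U32, elemNum and mask helpers behave as they are used in the patch; it is not additional patch content.

  // Sketch of one expanded CHECK_CMP_OP call (16-wide unsigned >=).
  simd16dw _1, _2;                          // two 16-wide double-word vectors
  simd16m  _3;                              // 16-wide mask receiving the result
  LOAD(_1, (const char *) (data+index1));
  LOAD(_2, (const char *) (data+index2));
  GE_U32(_3, _2, _1);                       // per-lane unsigned >= into _3
  uint32_t m = 0;
  for (uint32_t i = 0; i < elemNum(_3); ++i)  // 16 lanes
    m |= ((data[i+index2] >= data[i+index1]) ? 1 : 0) << i;
  GBE_ASSERT(m == mask(_3));                // scalar reference equals SIMD mask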