From aaeb07989ee30c589b034a551f71e23e7b8056b2 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Tue, 20 Sep 2016 17:14:54 -0500
Subject: [PATCH] swr: [rasterizer jitter] fixes for icc in vs2015 compat mode

- Move most jitter functionality into SwrJit namespace
- Avoid global "using namespace llvm" in headers

Signed-off-by: Tim Rowley <timothy.o.rowley@intel.com>
---
 .../swr/rasterizer/jitter/JitManager.cpp      |    1 +
 .../swr/rasterizer/jitter/JitManager.h        |   35 +-
 .../swr/rasterizer/jitter/blend_jit.cpp       |    3 +
 .../drivers/swr/rasterizer/jitter/builder.cpp |   84 +-
 .../drivers/swr/rasterizer/jitter/builder.h   |   72 +-
 .../swr/rasterizer/jitter/builder_misc.cpp    | 2576 +++++++++--------
 .../swr/rasterizer/jitter/fetch_jit.cpp       |    2 +
 .../jitter/scripts/gen_llvm_ir_macros.py      |   39 +-
 .../jitter/scripts/gen_llvm_types.py          |   25 +-
 .../swr/rasterizer/jitter/streamout_jit.cpp   |    3 +
 src/gallium/drivers/swr/swr_shader.cpp        |    2 +
 src/gallium/drivers/swr/swr_tex_sample.cpp    |    1 +
 12 files changed, 1439 insertions(+), 1404 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 4540105ed9a..cc773d75cc2 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -72,6 +72,7 @@
 #endif
 
 using namespace llvm;
+using namespace SwrJit;
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Contructor for JitManager.
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index f474143842a..7c0eaa9c31d 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -86,7 +86,6 @@ using PassManager = llvm::legacy::PassManager;
 
 #pragma pop_macro("DEBUG")
 
-using namespace llvm;
 //////////////////////////////////////////////////////////////////////////
 /// JitInstructionSet
 /// @brief Subclass of InstructionSet that allows users to override
@@ -136,7 +135,7 @@ private:
 
 
 
-struct JitLLVMContext : LLVMContext
+struct JitLLVMContext : llvm::LLVMContext
 {
 };
 
@@ -150,32 +149,32 @@ struct JitManager
     ~JitManager(){};
 
     JitLLVMContext          mContext;   ///< LLVM compiler
-    IRBuilder<>             mBuilder;   ///< LLVM IR Builder
-    ExecutionEngine*        mpExec;
+    llvm::IRBuilder<>       mBuilder;   ///< LLVM IR Builder
+    llvm::ExecutionEngine*  mpExec;
 
     // Need to be rebuilt after a JIT and before building new IR
-    Module* mpCurrentModule;
+    llvm::Module* mpCurrentModule;
     bool mIsModuleFinalized;
     uint32_t mJitNumber;
 
     uint32_t                 mVWidth;
 
     // Built in types.
-    Type*                mInt8Ty;
-    Type*                mInt32Ty;
-    Type*                mInt64Ty;
-    Type*                mFP32Ty;
-    StructType*          mV4FP32Ty;
-    StructType*          mV4Int32Ty;
+    llvm::Type*                mInt8Ty;
+    llvm::Type*                mInt32Ty;
+    llvm::Type*                mInt64Ty;
+    llvm::Type*                mFP32Ty;
+    llvm::StructType*          mV4FP32Ty;
+    llvm::StructType*          mV4Int32Ty;
 
-    Type* mSimtFP32Ty;
-    Type* mSimtInt32Ty;
+    llvm::Type* mSimtFP32Ty;
+    llvm::Type* mSimtInt32Ty;
 
-    Type* mSimdVectorInt32Ty;
-    Type* mSimdVectorTy;
+    llvm::Type* mSimdVectorInt32Ty;
+    llvm::Type* mSimdVectorTy;
 
     // fetch shader types
-    FunctionType*        mFetchShaderTy;
+    llvm::FunctionType*        mFetchShaderTy;
 
     JitInstructionSet mArch;
     std::string mCore;
@@ -183,6 +182,6 @@ struct JitManager
     void SetupNewModule();
     bool SetupModuleFromIR(const uint8_t *pIR);
 
-    void DumpAsm(Function* pFunction, const char* fileName);
-    static void DumpToFile(Function *f, const char *fileName);
+    void DumpAsm(llvm::Function* pFunction, const char* fileName);
+    static void DumpToFile(llvm::Function *f, const char *fileName);
 };
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index 940399c2020..1452d27675a 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -37,6 +37,9 @@
 // components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
 #define QUANTIZE_THRESHOLD 2
 
+using namespace llvm;
+using namespace SwrJit;
+
 //////////////////////////////////////////////////////////////////////////
 /// Interface to Jitting a blend shader
 //////////////////////////////////////////////////////////////////////////
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 01468c48eed..6ee4d857326 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -30,49 +30,53 @@
 
 #include "builder.h"
 
-using namespace llvm;
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Contructor for Builder.
-/// @param pJitMgr - JitManager which contains modules, function passes, etc.
-Builder::Builder(JitManager *pJitMgr)
-    : mpJitMgr(pJitMgr)
+namespace SwrJit
 {
-    mVWidth = pJitMgr->mVWidth;
+    using namespace llvm;
 
-    mpIRBuilder = &pJitMgr->mBuilder;
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Constructor for Builder.
+    /// @param pJitMgr - JitManager which contains modules, function passes, etc.
+    Builder::Builder(JitManager *pJitMgr)
+        : mpJitMgr(pJitMgr)
+    {
+        mVWidth = pJitMgr->mVWidth;
 
-    mVoidTy = Type::getVoidTy(pJitMgr->mContext);
-    mFP16Ty = Type::getHalfTy(pJitMgr->mContext);
-    mFP32Ty = Type::getFloatTy(pJitMgr->mContext);
-    mDoubleTy = Type::getDoubleTy(pJitMgr->mContext);
-    mInt1Ty = Type::getInt1Ty(pJitMgr->mContext);
-    mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
-    mInt16Ty = Type::getInt16Ty(pJitMgr->mContext);
-    mInt32Ty = Type::getInt32Ty(pJitMgr->mContext);
-    mInt8PtrTy = PointerType::get(mInt8Ty, 0);
-    mInt16PtrTy = PointerType::get(mInt16Ty, 0);
-    mInt32PtrTy = PointerType::get(mInt32Ty, 0);
-    mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
-    mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure)
-    mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
-    mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth);
-    mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth);
-    mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth);
-    mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth);
-    mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth);
-    mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
-    mSimdVectorTy = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mSimdFP32Ty), false);
+        mpIRBuilder = &pJitMgr->mBuilder;
 
-    if (sizeof(uint32_t*) == 4)
-    {
-        mIntPtrTy = mInt32Ty;
-        mSimdIntPtrTy = mSimdInt32Ty;
-    }
-    else
-    {
-        SWR_ASSERT(sizeof(uint32_t*) == 8);
-        mIntPtrTy = mInt64Ty;
-        mSimdIntPtrTy = mSimdInt64Ty;
+        mVoidTy = Type::getVoidTy(pJitMgr->mContext);
+        mFP16Ty = Type::getHalfTy(pJitMgr->mContext);
+        mFP32Ty = Type::getFloatTy(pJitMgr->mContext);
+        mDoubleTy = Type::getDoubleTy(pJitMgr->mContext);
+        mInt1Ty = Type::getInt1Ty(pJitMgr->mContext);
+        mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
+        mInt16Ty = Type::getInt16Ty(pJitMgr->mContext);
+        mInt32Ty = Type::getInt32Ty(pJitMgr->mContext);
+        mInt8PtrTy = PointerType::get(mInt8Ty, 0);
+        mInt16PtrTy = PointerType::get(mInt16Ty, 0);
+        mInt32PtrTy = PointerType::get(mInt32Ty, 0);
+        mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
+        mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure)
+        mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
+        mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth);
+        mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth);
+        mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth);
+        mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth);
+        mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth);
+        mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
+        mSimdVectorTy = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mSimdFP32Ty), false);
+        mSimdVectorTRTy = StructType::get(pJitMgr->mContext, std::vector<Type*>(5, mSimdFP32Ty), false);
+
+        if (sizeof(uint32_t*) == 4)
+        {
+            mIntPtrTy = mInt32Ty;
+            mSimdIntPtrTy = mSimdInt32Ty;
+        }
+        else
+        {
+            SWR_ASSERT(sizeof(uint32_t*) == 8);
+            mIntPtrTy = mInt64Ty;
+            mSimdIntPtrTy = mSimdInt64Ty;
+        }
     }
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index ddc32f432c9..515560e0597 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -32,47 +32,49 @@
 #include "JitManager.h"
 #include "common/formats.h"
 
-using namespace llvm;
-
-struct Builder
+namespace SwrJit
 {
-    Builder(JitManager *pJitMgr);
-    IRBuilder<>* IRB() { return mpIRBuilder; };
-    JitManager* JM() { return mpJitMgr; }
+    using namespace llvm;
+    struct Builder
+    {
+        Builder(JitManager *pJitMgr);
+        IRBuilder<>* IRB() { return mpIRBuilder; };
+        JitManager* JM() { return mpJitMgr; }
 
-    JitManager* mpJitMgr;
-    IRBuilder<>* mpIRBuilder;
+        JitManager* mpJitMgr;
+        IRBuilder<>* mpIRBuilder;
 
-    uint32_t             mVWidth;
+        uint32_t             mVWidth;
 
-    // Built in types.
-    Type*                mVoidTy;
-    Type*                mInt1Ty;
-    Type*                mInt8Ty;
-    Type*                mInt16Ty;
-    Type*                mInt32Ty;
-    Type*                mInt64Ty;
-    Type*                mIntPtrTy;
-    Type*                mFP16Ty;
-    Type*                mFP32Ty;
-    Type*                mDoubleTy;
-    Type*                mInt8PtrTy;
-    Type*                mInt16PtrTy;
-    Type*                mInt32PtrTy;
-    Type*                mSimdFP16Ty;
-    Type*                mSimdFP32Ty;
-    Type*                mSimdInt1Ty;
-    Type*                mSimdInt16Ty;
-    Type*                mSimdInt32Ty;
-    Type*                mSimdInt64Ty;
-    Type*                mSimdIntPtrTy;
-    Type*                mSimdVectorTy;
-    StructType*          mV4FP32Ty;
-    StructType*          mV4Int32Ty;
+        // Built in types.
+        Type*                mVoidTy;
+        Type*                mInt1Ty;
+        Type*                mInt8Ty;
+        Type*                mInt16Ty;
+        Type*                mInt32Ty;
+        Type*                mInt64Ty;
+        Type*                mIntPtrTy;
+        Type*                mFP16Ty;
+        Type*                mFP32Ty;
+        Type*                mDoubleTy;
+        Type*                mInt8PtrTy;
+        Type*                mInt16PtrTy;
+        Type*                mInt32PtrTy;
+        Type*                mSimdFP16Ty;
+        Type*                mSimdFP32Ty;
+        Type*                mSimdInt1Ty;
+        Type*                mSimdInt16Ty;
+        Type*                mSimdInt32Ty;
+        Type*                mSimdInt64Ty;
+        Type*                mSimdIntPtrTy;
+        Type*                mSimdVectorTy;
+        Type*                mSimdVectorTRTy;
+        StructType*          mV4FP32Ty;
+        StructType*          mV4Int32Ty;
 
 #include "builder_gen.h"
 #include "builder_x86.h"
 #include "builder_misc.h"
 #include "builder_math.h"
-
-};
+    };
+}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 13c1daf6fe1..d755cc391a0 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -30,962 +30,1051 @@
 #include "builder.h"
 #include "common/rdtsc_buckets.h"
 
-void __cdecl CallPrint(const char* fmt, ...);
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Convert an IEEE 754 32-bit single precision float to an
-///        16 bit float with 5 exponent bits and a variable
-///        number of mantissa bits.
-/// @param val - 32-bit float
-/// @todo Maybe move this outside of this file into a header?
-static uint16_t Convert32To16Float(float val)
-{
-    uint32_t sign, exp, mant;
-    uint32_t roundBits;
 
-    // Extract the sign, exponent, and mantissa
-    uint32_t uf = *(uint32_t*)&val;
-    sign = (uf & 0x80000000) >> 31;
-    exp = (uf & 0x7F800000) >> 23;
-    mant = uf & 0x007FFFFF;
+namespace SwrJit
+{
+    void __cdecl CallPrint(const char* fmt, ...);
 
-    // Check for out of range
-    if (std::isnan(val))
-    {
-        exp = 0x1F;
-        mant = 0x200;
-        sign = 1;                     // set the sign bit for NANs
-    }
-    else if (std::isinf(val))
-    {
-        exp = 0x1f;
-        mant = 0x0;
-    }
-    else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
-    {
-        exp = 0x1E;
-        mant = 0x3FF;
-    }
-    else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Convert an IEEE 754 32-bit single precision float to a
+    ///        16-bit float with 5 exponent bits and 10 mantissa
+    ///        bits.
+    /// @param val - 32-bit float
+    /// @todo Maybe move this outside of this file into a header?
+    static uint16_t Convert32To16Float(float val)
     {
-        mant |= 0x00800000;
-        for (; exp <= 0x70; mant >>= 1, exp++)
-            ;
-        exp = 0;
-        mant = mant >> 13;
-    }
-    else if (exp < 0x66) // Too small to represent -> Zero
-    {
-        exp = 0;
-        mant = 0;
-    }
-    else
-    {
-        // Saves bits that will be shifted off for rounding
-        roundBits = mant & 0x1FFFu;
-        // convert exponent and mantissa to 16 bit format
-        exp = exp - 0x70;
-        mant = mant >> 13;
+        uint32_t sign, exp, mant;
+        uint32_t roundBits;
 
-        // Essentially RTZ, but round up if off by only 1 lsb
-        if (roundBits == 0x1FFFu)
+        // Extract the sign, exponent, and mantissa
+        uint32_t uf = *(uint32_t*)&val;
+        sign = (uf & 0x80000000) >> 31;
+        exp = (uf & 0x7F800000) >> 23;
+        mant = uf & 0x007FFFFF;
+
+        // Check for out of range
+        if (std::isnan(val))
         {
-            mant++;
-            // check for overflow
-            if ((mant & 0xC00u) != 0)
-                exp++;
-            // make sure only the needed bits are used
-            mant &= 0x3FF;
+            exp = 0x1F;
+            mant = 0x200;
+            sign = 1;                     // set the sign bit for NANs
+        }
+        else if (std::isinf(val))
+        {
+            exp = 0x1f;
+            mant = 0x0;
+        }
+        else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
+        {
+            exp = 0x1E;
+            mant = 0x3FF;
+        }
+        else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
+        {
+            mant |= 0x00800000;
+            for (; exp <= 0x70; mant >>= 1, exp++)
+                ;
+            exp = 0;
+            mant = mant >> 13;
+        }
+        else if (exp < 0x66) // Too small to represent -> Zero
+        {
+            exp = 0;
+            mant = 0;
+        }
+        else
+        {
+            // Saves bits that will be shifted off for rounding
+            roundBits = mant & 0x1FFFu;
+            // convert exponent and mantissa to 16 bit format
+            exp = exp - 0x70;
+            mant = mant >> 13;
+
+            // Essentially RTZ, but round up if off by only 1 lsb
+            if (roundBits == 0x1FFFu)
+            {
+                mant++;
+                // check for overflow
+                if ((mant & 0xC00u) != 0)
+                    exp++;
+                // make sure only the needed bits are used
+                mant &= 0x3FF;
+            }
         }
-    }
-
-    uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
-    return (uint16_t)tmpVal;
-}
 
-//////////////////////////////////////////////////////////////////////////
-/// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision
-///        float
-/// @param val - 16-bit float
-/// @todo Maybe move this outside of this file into a header?
-static float ConvertSmallFloatTo32(UINT val)
-{
-    UINT result;
-    if ((val & 0x7fff) == 0)
-    {
-        result = ((uint32_t)(val & 0x8000)) << 16;
-    }
-    else if ((val & 0x7c00) == 0x7c00)
-    {
-        result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
-        result |= ((uint32_t)val & 0x8000) << 16;
+        uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
+        return (uint16_t)tmpVal;
     }
-    else
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
+    ///        float
+    /// @param val - 16-bit float
+    /// @todo Maybe move this outside of this file into a header?
+    static float ConvertSmallFloatTo32(UINT val)
     {
-        uint32_t sign = (val & 0x8000) << 16;
-        uint32_t mant = (val & 0x3ff) << 13;
-        uint32_t exp = (val >> 10) & 0x1f;
-        if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
+        UINT result;
+        if ((val & 0x7fff) == 0)
+        {
+            result = ((uint32_t)(val & 0x8000)) << 16;
+        }
+        else if ((val & 0x7c00) == 0x7c00)
+        {
+            result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
+            result |= ((uint32_t)val & 0x8000) << 16;
+        }
+        else
         {
-            mant <<= 1;
-            while (mant < (0x400 << 13))
+            uint32_t sign = (val & 0x8000) << 16;
+            uint32_t mant = (val & 0x3ff) << 13;
+            uint32_t exp = (val >> 10) & 0x1f;
+            if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
             {
-                exp--;
                 mant <<= 1;
+                while (mant < (0x400 << 13))
+                {
+                    exp--;
+                    mant <<= 1;
+                }
+                mant &= (0x3ff << 13);
             }
-            mant &= (0x3ff << 13);
+            exp = ((exp - 15 + 127) & 0xff) << 23;
+            result = sign | exp | mant;
         }
-        exp = ((exp - 15 + 127) & 0xff) << 23;
-        result = sign | exp | mant;
-    }
 
-    return *(float*)&result;
-}
+        return *(float*)&result;
+    }
 
-Constant *Builder::C(bool i)
-{
-    return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
-}
+    Constant *Builder::C(bool i)
+    {
+        return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
+    }
 
-Constant *Builder::C(char i)
-{
-    return ConstantInt::get(IRB()->getInt8Ty(), i);
-}
+    Constant *Builder::C(char i)
+    {
+        return ConstantInt::get(IRB()->getInt8Ty(), i);
+    }
 
-Constant *Builder::C(uint8_t i)
-{
-    return ConstantInt::get(IRB()->getInt8Ty(), i);
-}
+    Constant *Builder::C(uint8_t i)
+    {
+        return ConstantInt::get(IRB()->getInt8Ty(), i);
+    }
 
-Constant *Builder::C(int i)
-{
-    return ConstantInt::get(IRB()->getInt32Ty(), i);
-}
+    Constant *Builder::C(int i)
+    {
+        return ConstantInt::get(IRB()->getInt32Ty(), i);
+    }
 
-Constant *Builder::C(int64_t i)
-{
-    return ConstantInt::get(IRB()->getInt64Ty(), i);
-}
+    Constant *Builder::C(int64_t i)
+    {
+        return ConstantInt::get(IRB()->getInt64Ty(), i);
+    }
 
-Constant *Builder::C(uint16_t i)
-{
-    return ConstantInt::get(mInt16Ty,i);
-}
+    Constant *Builder::C(uint16_t i)
+    {
+        return ConstantInt::get(mInt16Ty,i);
+    }
 
-Constant *Builder::C(uint32_t i)
-{
-    return ConstantInt::get(IRB()->getInt32Ty(), i);
-}
+    Constant *Builder::C(uint32_t i)
+    {
+        return ConstantInt::get(IRB()->getInt32Ty(), i);
+    }
 
-Constant *Builder::C(float i)
-{
-    return ConstantFP::get(IRB()->getFloatTy(), i);
-}
+    Constant *Builder::C(float i)
+    {
+        return ConstantFP::get(IRB()->getFloatTy(), i);
+    }
 
-Constant *Builder::PRED(bool pred)
-{
-    return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
-}
+    Constant *Builder::PRED(bool pred)
+    {
+        return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
+    }
 
-Value *Builder::VIMMED1(int i)
-{
-    return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
-}
+    Value *Builder::VIMMED1(int i)
+    {
+        return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
+    }
 
-Value *Builder::VIMMED1(uint32_t i)
-{
-    return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
-}
+    Value *Builder::VIMMED1(uint32_t i)
+    {
+        return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
+    }
 
-Value *Builder::VIMMED1(float i)
-{
-    return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
-}
+    Value *Builder::VIMMED1(float i)
+    {
+        return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
+    }
 
-Value *Builder::VIMMED1(bool i)
-{
-    return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
-}
+    Value *Builder::VIMMED1(bool i)
+    {
+        return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
+    }
 
-Value *Builder::VUNDEF_IPTR()
-{
-    return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
-}
+    Value *Builder::VUNDEF_IPTR()
+    {
+        return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
+    }
 
-Value *Builder::VUNDEF_I()
-{
-    return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
-}
+    Value *Builder::VUNDEF_I()
+    {
+        return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
+    }
 
-Value *Builder::VUNDEF(Type *ty, uint32_t size)
-{
-    return UndefValue::get(VectorType::get(ty, size));
-}
+    Value *Builder::VUNDEF(Type *ty, uint32_t size)
+    {
+        return UndefValue::get(VectorType::get(ty, size));
+    }
 
-Value *Builder::VUNDEF_F()
-{
-    return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
-}
+    Value *Builder::VUNDEF_F()
+    {
+        return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
+    }
 
-Value *Builder::VUNDEF(Type* t)
-{
-    return UndefValue::get(VectorType::get(t, mVWidth));
-}
+    Value *Builder::VUNDEF(Type* t)
+    {
+        return UndefValue::get(VectorType::get(t, mVWidth));
+    }
 
-#if HAVE_LLVM == 0x306
-Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
-{
-    return VINSERT(vec, val, C((int64_t)index));
-}
-#endif
+    #if HAVE_LLVM == 0x306
+    Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
+    {
+        return VINSERT(vec, val, C((int64_t)index));
+    }
+    #endif
 
-Value *Builder::VBROADCAST(Value *src)
-{
-    // check if src is already a vector
-    if (src->getType()->isVectorTy())
+    Value *Builder::VBROADCAST(Value *src)
     {
-        return src;
+        // check if src is already a vector
+        if (src->getType()->isVectorTy())
+        {
+            return src;
+        }
+
+        return VECTOR_SPLAT(mVWidth, src);
     }
 
-    return VECTOR_SPLAT(mVWidth, src);
-}
+    uint32_t Builder::IMMED(Value* v)
+    {
+        SWR_ASSERT(isa<ConstantInt>(v));
+        ConstantInt *pValConst = cast<ConstantInt>(v);
+        return pValConst->getZExtValue();
+    }
 
-uint32_t Builder::IMMED(Value* v)
-{
-    SWR_ASSERT(isa<ConstantInt>(v));
-    ConstantInt *pValConst = cast<ConstantInt>(v);
-    return pValConst->getZExtValue();
-}
+    int32_t Builder::S_IMMED(Value* v)
+    {
+        SWR_ASSERT(isa<ConstantInt>(v));
+        ConstantInt *pValConst = cast<ConstantInt>(v);
+        return pValConst->getSExtValue();
+    }
 
-int32_t Builder::S_IMMED(Value* v)
-{
-    SWR_ASSERT(isa<ConstantInt>(v));
-    ConstantInt *pValConst = cast<ConstantInt>(v);
-    return pValConst->getSExtValue();
-}
+    Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
+    {
+        std::vector<Value*> indices;
+        for (auto i : indexList)
+            indices.push_back(i);
+        return GEPA(ptr, indices);
+    }
 
-Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
-{
-    std::vector<Value*> indices;
-    for (auto i : indexList)
-        indices.push_back(i);
-    return GEPA(ptr, indices);
-}
+    Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
+    {
+        std::vector<Value*> indices;
+        for (auto i : indexList)
+            indices.push_back(C(i));
+        return GEPA(ptr, indices);
+    }
 
-Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
-{
-    std::vector<Value*> indices;
-    for (auto i : indexList)
-        indices.push_back(C(i));
-    return GEPA(ptr, indices);
-}
+    LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
+    {
+        std::vector<Value*> valIndices;
+        for (auto i : indices)
+            valIndices.push_back(C(i));
+        return LOAD(GEPA(basePtr, valIndices), name);
+    }
 
-LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
-{
-    std::vector<Value*> valIndices;
-    for (auto i : indices)
-        valIndices.push_back(C(i));
-    return LOAD(GEPA(basePtr, valIndices), name);
-}
+    LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
+    {
+        std::vector<Value*> valIndices;
+        for (auto i : indices)
+            valIndices.push_back(i);
+        return LOAD(GEPA(basePtr, valIndices), name);
+    }
 
-LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
-{
-    std::vector<Value*> valIndices;
-    for (auto i : indices)
-        valIndices.push_back(i);
-    return LOAD(GEPA(basePtr, valIndices), name);
-}
+    StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
+    {
+        std::vector<Value*> valIndices;
+        for (auto i : indices)
+            valIndices.push_back(C(i));
+        return STORE(val, GEPA(basePtr, valIndices));
+    }
 
-StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
-{
-    std::vector<Value*> valIndices;
-    for (auto i : indices)
-        valIndices.push_back(C(i));
-    return STORE(val, GEPA(basePtr, valIndices));
-}
+    StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
+    {
+        std::vector<Value*> valIndices;
+        for (auto i : indices)
+            valIndices.push_back(i);
+        return STORE(val, GEPA(basePtr, valIndices));
+    }
 
-StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
-{
-    std::vector<Value*> valIndices;
-    for (auto i : indices)
-        valIndices.push_back(i);
-    return STORE(val, GEPA(basePtr, valIndices));
-}
+    CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
+    {
+        std::vector<Value*> args;
+        for (auto arg : argsList)
+            args.push_back(arg);
+        return CALLA(Callee, args);
+    }
 
-CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
-{
-    std::vector<Value*> args;
-    for (auto arg : argsList)
+    #if HAVE_LLVM > 0x306
+    CallInst *Builder::CALL(Value *Callee, Value* arg)
+    {
+        std::vector<Value*> args;
         args.push_back(arg);
-    return CALLA(Callee, args);
-}
-
-#if HAVE_LLVM > 0x306
-CallInst *Builder::CALL(Value *Callee, Value* arg)
-{
-    std::vector<Value*> args;
-    args.push_back(arg);
-    return CALLA(Callee, args);
-}
+        return CALLA(Callee, args);
+    }
 
-CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
-{
-    std::vector<Value*> args;
-    args.push_back(arg1);
-    args.push_back(arg2);
-    return CALLA(Callee, args);
-}
+    CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
+    {
+        std::vector<Value*> args;
+        args.push_back(arg1);
+        args.push_back(arg2);
+        return CALLA(Callee, args);
+    }
 
-CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
-{
-    std::vector<Value*> args;
-    args.push_back(arg1);
-    args.push_back(arg2);
-    args.push_back(arg3);
-    return CALLA(Callee, args);
-}
-#endif
-
-Value *Builder::VRCP(Value *va)
-{
-    return FDIV(VIMMED1(1.0f), va);  // 1 / a
-}
+    CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
+    {
+        std::vector<Value*> args;
+        args.push_back(arg1);
+        args.push_back(arg2);
+        args.push_back(arg3);
+        return CALLA(Callee, args);
+    }
+    #endif
 
-Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
-{
-    Value* vOut = FMADDPS(vA, vX, vC);
-    vOut = FMADDPS(vB, vY, vOut);
-    return vOut;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Generate an i32 masked load operation in LLVM IR.  If not  
-/// supported on the underlying platform, emulate it with float masked load
-/// @param src - base address pointer for the load
-/// @param vMask - SIMD wide mask that controls whether to access memory load 0
-Value *Builder::MASKLOADD(Value* src,Value* mask)
-{
-    Value* vResult;
-    // use avx2 gather instruction is available
-    if(JM()->mArch.AVX2())
-    {
-        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
-        vResult = CALL(func,{src,mask});
-    }
-    else
-    {
-        // maskload intrinsic expects integer mask operand in llvm >= 3.8
-#if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
-        mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
-#else
-        mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
-#endif
-        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
-        vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
-    }
-    return vResult;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief insert a JIT call to CallPrint
-/// - outputs formatted string to both stdout and VS output window
-/// - DEBUG builds only
-/// Usage example:
-///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
-///   where C(lane) creates a constant value to print, and pIndex is the Value*
-///   result from a GEP, printing out the pointer to memory
-/// @param printStr - constant string to print, which includes format specifiers
-/// @param printArgs - initializer list of Value*'s to print to std out
-CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
-{
-    // push the arguments to CallPrint into a vector
-    std::vector<Value*> printCallArgs;
-    // save room for the format string.  we still need to modify it for vectors
-    printCallArgs.resize(1);
+    Value *Builder::VRCP(Value *va)
+    {
+        return FDIV(VIMMED1(1.0f), va);  // 1 / a
+    }
 
-    // search through the format string for special processing
-    size_t pos = 0;
-    std::string tempStr(printStr);
-    pos = tempStr.find('%', pos);
-    auto v = printArgs.begin();
+    Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
+    {
+        Value* vOut = FMADDPS(vA, vX, vC);
+        vOut = FMADDPS(vB, vY, vOut);
+        return vOut;
+    }
 
-    while ((pos != std::string::npos) && (v != printArgs.end()))
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Generate an i32 masked load operation in LLVM IR.  If not  
+    /// supported on the underlying platform, emulate it with float masked load
+    /// @param src - base address pointer for the load
+    /// @param mask - SIMD wide mask that controls whether to access memory or load 0
+    Value *Builder::MASKLOADD(Value* src,Value* mask)
     {
-        Value* pArg = *v;
-        Type* pType = pArg->getType();
+        Value* vResult;
+        // use avx2 maskload instruction if available
+        if(JM()->mArch.AVX2())
+        {
+            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
+            vResult = CALL(func,{src,mask});
+        }
+        else
+        {
+            // maskload intrinsic expects integer mask operand in llvm >= 3.8
+    #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
+            mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
+    #else
+            mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
+    #endif
+            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
+            vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
+        }
+        return vResult;
+    }
 
-        if (pType->isVectorTy())
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief insert a JIT call to CallPrint
+    /// - outputs formatted string to both stdout and VS output window
+    /// - DEBUG builds only
+    /// Usage example:
+    ///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
+    ///   where C(lane) creates a constant value to print, and pIndex is the Value*
+    ///   result from a GEP, printing out the pointer to memory
+    /// @param printStr - constant string to print, which includes format specifiers
+    /// @param printArgs - initializer list of Value*'s to print to std out
+    CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
+    {
+        // push the arguments to CallPrint into a vector
+        std::vector<Value*> printCallArgs;
+        // save room for the format string.  we still need to modify it for vectors
+        printCallArgs.resize(1);
+
+        // search through the format string for special processing
+        size_t pos = 0;
+        std::string tempStr(printStr);
+        pos = tempStr.find('%', pos);
+        auto v = printArgs.begin();
+
+        while ((pos != std::string::npos) && (v != printArgs.end()))
         {
-            Type* pContainedType = pType->getContainedType(0);
+            Value* pArg = *v;
+            Type* pType = pArg->getType();
 
-            if (toupper(tempStr[pos + 1]) == 'X')
+            if (pType->isVectorTy())
             {
-                tempStr[pos] = '0';
-                tempStr[pos + 1] = 'x';
-                tempStr.insert(pos + 2, "%08X ");
-                pos += 7;
-
-                printCallArgs.push_back(VEXTRACT(pArg, C(0)));
+                Type* pContainedType = pType->getContainedType(0);
 
-                std::string vectorFormatStr;
-                for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
+                if (toupper(tempStr[pos + 1]) == 'X')
                 {
-                    vectorFormatStr += "0x%08X ";
-                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                    tempStr[pos] = '0';
+                    tempStr[pos + 1] = 'x';
+                    tempStr.insert(pos + 2, "%08X ");
+                    pos += 7;
+
+                    printCallArgs.push_back(VEXTRACT(pArg, C(0)));
+
+                    std::string vectorFormatStr;
+                    for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
+                    {
+                        vectorFormatStr += "0x%08X ";
+                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                    }
+
+                    tempStr.insert(pos, vectorFormatStr);
+                    pos += vectorFormatStr.size();
                 }
-
-                tempStr.insert(pos, vectorFormatStr);
-                pos += vectorFormatStr.size();
-            }
-            else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
-            {
-                uint32_t i = 0;
-                for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
+                else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
                 {
-                    tempStr.insert(pos, std::string("%f "));
-                    pos += 3;
+                    uint32_t i = 0;
+                    for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
+                    {
+                        tempStr.insert(pos, std::string("%f "));
+                        pos += 3;
+                        printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
+                    }
                     printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
                 }
-                printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
+                else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
+                {
+                    uint32_t i = 0;
+                    for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
+                    {
+                        tempStr.insert(pos, std::string("%d "));
+                        pos += 3;
+                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                    }
+                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                }
             }
-            else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
+            else
             {
-                uint32_t i = 0;
-                for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
+                if (toupper(tempStr[pos + 1]) == 'X')
                 {
-                    tempStr.insert(pos, std::string("%d "));
+                    tempStr[pos] = '0';
+                    tempStr.insert(pos + 1, "x%08");
+                    printCallArgs.push_back(pArg);
                     pos += 3;
-                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                 }
-                printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                // for %f we need to cast float Values to doubles so that they print out correctly
+                else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
+                {
+                    printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
+                    pos++;
+                }
+                else
+                {
+                    printCallArgs.push_back(pArg);
+                }
             }
+
+            // advance to the next argument
+            v++;
+            pos = tempStr.find('%', ++pos);
         }
-        else
+
+        // create global variable constant string
+        Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
+        GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
+        JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
+
+        // get a pointer to the first character in the constant string array
+        std::vector<Constant*> geplist{C(0),C(0)};
+    #if HAVE_LLVM == 0x306
+        Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
+    #else
+        Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
+    #endif
+
+        // insert the pointer to the format string in the argument vector
+        printCallArgs[0] = strGEP;
+
+        // get pointer to CallPrint function and insert decl into the module if needed
+        std::vector<Type*> args;
+        args.push_back(PointerType::get(mInt8Ty,0));
+        FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
+        Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
+
+        // if we haven't yet added the symbol to the symbol table
+        if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
         {
-            if (toupper(tempStr[pos + 1]) == 'X')
-            {
-                tempStr[pos] = '0';
-                tempStr.insert(pos + 1, "x%08");
-                printCallArgs.push_back(pArg);
-                pos += 3;
-            }
-            // for %f we need to cast float Values to doubles so that they print out correctly
-            else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
-            {
-                printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
-                pos++;
-            }
-            else
-            {
-                printCallArgs.push_back(pArg);
-            }
+            sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
         }
 
-        // advance to the next arguement
-        v++;
-        pos = tempStr.find('%', ++pos);
+        // insert a call to CallPrint
+        return CALLA(callPrintFn,printCallArgs);
     }
 
-    // create global variable constant string
-    Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
-    GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
-    JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
-
-    // get a pointer to the first character in the constant string array
-    std::vector<Constant*> geplist{C(0),C(0)};
-#if HAVE_LLVM == 0x306
-    Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
-#else
-    Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
-#endif
-
-    // insert the pointer to the format string in the argument vector
-    printCallArgs[0] = strGEP;
-
-    // get pointer to CallPrint function and insert decl into the module if needed
-    std::vector<Type*> args;
-    args.push_back(PointerType::get(mInt8Ty,0));
-    FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
-    Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
-
-    // if we haven't yet added the symbol to the symbol table
-    if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Wrapper around PRINT with initializer list.
+    CallInst* Builder::PRINT(const std::string &printStr)
     {
-        sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
+        return PRINT(printStr, {});
     }
 
-    // insert a call to CallPrint
-    return CALLA(callPrintFn,printCallArgs);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Wrapper around PRINT with initializer list.
-CallInst* Builder::PRINT(const std::string &printStr)
-{
-    return PRINT(printStr, {});
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Generate a masked gather operation in LLVM IR.  If not  
-/// supported on the underlying platform, emulate it with loads
-/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
-/// @param pBase - Int8* base VB address pointer value
-/// @param vIndices - SIMD wide value of VB byte offsets
-/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
-/// @param scale - value to scale indices by
-Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
-{
-    Value* vGather;
-
-    // use avx2 gather instruction if available
-    if(JM()->mArch.AVX2())
-    {
-        // force mask to <N x float>, required by vgather
-        vMask = BITCAST(vMask, mSimdFP32Ty);
-        vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
-    }
-    else
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Generate a masked gather operation in LLVM IR.  If not  
+    /// supported on the underlying platform, emulate it with loads
+    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
+    /// @param pBase - Int8* base VB address pointer value
+    /// @param vIndices - SIMD wide value of VB byte offsets
+    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
+    /// @param scale - value to scale indices by
+    Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
     {
-        Value* pStack = STACKSAVE();
+        Value* vGather;
 
-        // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
-        Value* vSrcPtr = ALLOCA(vSrc->getType());
-        STORE(vSrc, vSrcPtr);
-
-        vGather = VUNDEF_F();
-        Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
-        Value *vOffsets = MUL(vIndices,vScaleVec);
-        Value *mask = MASK(vMask);
-        for(uint32_t i = 0; i < mVWidth; ++i)
+        // use avx2 gather instruction if available
+        if(JM()->mArch.AVX2())
         {
-            // single component byte index
-            Value *offset = VEXTRACT(vOffsets,C(i));
-            // byte pointer to component
-            Value *loadAddress = GEP(pBase,offset);
-            loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
-            // pointer to the value to load if we're masking off a component
-            Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
-            Value *selMask = VEXTRACT(mask,C(i));
-            // switch in a safe address to load if we're trying to access a vertex 
-            Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
-            Value *val = LOAD(validAddress);
-            vGather = VINSERT(vGather,val,C(i));
+            // force mask to <N x float>, required by vgather
+            vMask = BITCAST(vMask, mSimdFP32Ty);
+            vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
         }
-        STACKRESTORE(pStack);
-    }
+        else
+        {
+            Value* pStack = STACKSAVE();
 
-    return vGather;
-}
+            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
+            Value* vSrcPtr = ALLOCA(vSrc->getType());
+            STORE(vSrc, vSrcPtr);
 
-//////////////////////////////////////////////////////////////////////////
-/// @brief Generate a masked gather operation in LLVM IR.  If not  
-/// supported on the underlying platform, emulate it with loads
-/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
-/// @param pBase - Int8* base VB address pointer value
-/// @param vIndices - SIMD wide value of VB byte offsets
-/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
-/// @param scale - value to scale indices by
-Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
-{
-    Value* vGather;
+            vGather = VUNDEF_F();
+            Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
+            Value *vOffsets = MUL(vIndices,vScaleVec);
+            Value *mask = MASK(vMask);
+            for(uint32_t i = 0; i < mVWidth; ++i)
+            {
+                // single component byte index
+                Value *offset = VEXTRACT(vOffsets,C(i));
+                // byte pointer to component
+                Value *loadAddress = GEP(pBase,offset);
+                loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
+                // pointer to the value to load if we're masking off a component
+                Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
+                Value *selMask = VEXTRACT(mask,C(i));
+                // load from the real address for enabled lanes, otherwise from the safe vSrc copy on the stack
+                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
+                Value *val = LOAD(validAddress);
+                vGather = VINSERT(vGather,val,C(i));
+            }
+            STACKRESTORE(pStack);
+        }
 
-    // use avx2 gather instruction if available
-    if(JM()->mArch.AVX2())
-    {
-        vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
+        return vGather;
     }
-    else
-    {
-        Value* pStack = STACKSAVE();
 
-        // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
-        Value* vSrcPtr = ALLOCA(vSrc->getType());
-        STORE(vSrc, vSrcPtr);
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Generate a masked gather operation in LLVM IR.  If not  
+    /// supported on the underlying platform, emulate it with loads
+    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
+    /// @param pBase - Int8* base VB address pointer value
+    /// @param vIndices - SIMD wide value of VB byte offsets
+    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
+    /// @param scale - value to scale indices by
+    Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
+    {
+        Value* vGather;
 
-        vGather = VUNDEF_I();
-        Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
-        Value *vOffsets = MUL(vIndices, vScaleVec);
-        Value *mask = MASK(vMask);
-        for(uint32_t i = 0; i < mVWidth; ++i)
+        // use avx2 gather instruction if available
+        if(JM()->mArch.AVX2())
         {
-            // single component byte index
-            Value *offset = VEXTRACT(vOffsets, C(i));
-            // byte pointer to component
-            Value *loadAddress = GEP(pBase, offset);
-            loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
-            // pointer to the value to load if we're masking off a component
-            Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
-            Value *selMask = VEXTRACT(mask, C(i));
-            // switch in a safe address to load if we're trying to access a vertex 
-            Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
-            Value *val = LOAD(validAddress, C(0));
-            vGather = VINSERT(vGather, val, C(i));
+            vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
         }
+        else
+        {
+            Value* pStack = STACKSAVE();
 
-        STACKRESTORE(pStack);
-    }
-    return vGather;
-}
+            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
+            Value* vSrcPtr = ALLOCA(vSrc->getType());
+            STORE(vSrc, vSrcPtr);
 
-//////////////////////////////////////////////////////////////////////////
-/// @brief convert x86 <N x float> mask to llvm <N x i1> mask
-Value* Builder::MASK(Value* vmask)
-{
-    Value* src = BITCAST(vmask, mSimdInt32Ty);
-    return ICMP_SLT(src, VIMMED1(0));
-}
+            vGather = VUNDEF_I();
+            Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
+            Value *vOffsets = MUL(vIndices, vScaleVec);
+            Value *mask = MASK(vMask);
+            for(uint32_t i = 0; i < mVWidth; ++i)
+            {
+                // single component byte index
+                Value *offset = VEXTRACT(vOffsets, C(i));
+                // byte pointer to component
+                Value *loadAddress = GEP(pBase, offset);
+                loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
+                // pointer to the value to load if we're masking off a component
+                Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
+                Value *selMask = VEXTRACT(mask, C(i));
+                // load from the real address for enabled lanes, otherwise from the safe vSrc copy on the stack
+                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
+                Value *val = LOAD(validAddress, C(0));
+                vGather = VINSERT(vGather, val, C(i));
+            }
 
-//////////////////////////////////////////////////////////////////////////
-/// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
-Value* Builder::VMASK(Value* mask)
-{
-    return S_EXT(mask, mSimdInt32Ty);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Generate a VPSHUFB operation in LLVM IR.  If not  
-/// supported on the underlying platform, emulate it
-/// @param a - 256bit SIMD(32x8bit) of 8bit integer values
-/// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
-/// Byte masks in lower 128 lane of b selects 8 bit values from lower 
-/// 128bits of a, and vice versa for the upper lanes.  If the mask 
-/// value is negative, '0' is inserted.
-Value *Builder::PSHUFB(Value* a, Value* b)
-{
-    Value* res;
-    // use avx2 pshufb instruction if available
-    if(JM()->mArch.AVX2())
+            STACKRESTORE(pStack);
+        }
+        return vGather;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
+    Value* Builder::MASK(Value* vmask)
     {
-        res = VPSHUFB(a, b);
+        Value* src = BITCAST(vmask, mSimdInt32Ty);
+        return ICMP_SLT(src, VIMMED1(0));
     }
-    else
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
+    Value* Builder::VMASK(Value* mask)
     {
-        Constant* cB = dyn_cast<Constant>(b);
-        // number of 8 bit elements in b
-        uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
-        // output vector
-        Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
+        return S_EXT(mask, mSimdInt32Ty);
+    }
 
-        // insert an 8 bit value from the high and low lanes of a per loop iteration
-        numElms /= 2;
-        for(uint32_t i = 0; i < numElms; i++)
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Generate a VPSHUFB operation in LLVM IR.  If not  
+    /// supported on the underlying platform, emulate it
+    /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
+    /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
+    /// Byte masks in lower 128 lane of b selects 8 bit values from lower 
+    /// 128bits of a, and vice versa for the upper lanes.  If the mask 
+    /// value is negative, '0' is inserted.
+    Value *Builder::PSHUFB(Value* a, Value* b)
+    {
+        Value* res;
+        // use avx2 pshufb instruction if available
+        if(JM()->mArch.AVX2())
+        {
+            res = VPSHUFB(a, b);
+        }
+        else
         {
-            ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
-            ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
+            Constant* cB = dyn_cast<Constant>(b);
+            // number of 8 bit elements in b
+            uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
+            // output vector
+            Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
+
+            // insert an 8 bit value from the high and low lanes of a per loop iteration
+            numElms /= 2;
+            for(uint32_t i = 0; i < numElms; i++)
+            {
+                ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
+                ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
 
-            // extract values from constant mask
-            char valLow128bLane =  (char)(cLow128b->getSExtValue());
-            char valHigh128bLane = (char)(cHigh128b->getSExtValue());
+                // extract values from constant mask
+                char valLow128bLane =  (char)(cLow128b->getSExtValue());
+                char valHigh128bLane = (char)(cHigh128b->getSExtValue());
 
-            Value* insertValLow128b;
-            Value* insertValHigh128b;
+                Value* insertValLow128b;
+                Value* insertValHigh128b;
 
-            // if the mask value is negative, insert a '0' in the respective output position
-            // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
-            insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
-            insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
+                // if the mask value is negative, insert a '0' in the respective output position
+                // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
+                insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
+                insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
 
-            vShuf = VINSERT(vShuf, insertValLow128b, i);
-            vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
+                vShuf = VINSERT(vShuf, insertValLow128b, i);
+                vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
+            }
+            res = vShuf;
         }
-        res = vShuf;
+        return res;
     }
-    return res;
-}
 
-//////////////////////////////////////////////////////////////////////////
-/// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32 
-/// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
-/// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only 
-/// lower 8 values are used.
-Value *Builder::PMOVSXBD(Value* a)
-{
-    // llvm-3.9 removed the pmovsxbd intrinsic
-#if HAVE_LLVM < 0x309
-    // use avx2 byte sign extend instruction if available
-    if(JM()->mArch.AVX2())
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
+    /// bits) in LLVM IR.  If not supported on the underlying platform, emulate it
+    /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only 
+    /// lower 8 values are used.
+    Value *Builder::PMOVSXBD(Value* a)
     {
-        Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
-        return CALL(pmovsxbd, std::initializer_list<Value*>{a});
-    }
-    else
-#endif
-    {
-        // VPMOVSXBD output type
-        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
-        // Extract 8 values from 128bit lane and sign extend
-        return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
+        // llvm-3.9 removed the pmovsxbd intrinsic
+    #if HAVE_LLVM < 0x309
+        // use avx2 byte sign extend instruction if available
+        if(JM()->mArch.AVX2())
+        {
+            Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
+            return CALL(pmovsxbd, std::initializer_list<Value*>{a});
+        }
+        else
+    #endif
+        {
+            // VPMOVSXBD output type
+            Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
+            // Extract 8 values from 128bit lane and sign extend
+            return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
+        }
     }
-}
 
-//////////////////////////////////////////////////////////////////////////
-/// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32 
-/// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
-/// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
-Value *Builder::PMOVSXWD(Value* a)
-{
-    // llvm-3.9 removed the pmovsxwd intrinsic
-#if HAVE_LLVM < 0x309
-    // use avx2 word sign extend if available
-    if(JM()->mArch.AVX2())
-    {
-        Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
-        return CALL(pmovsxwd, std::initializer_list<Value*>{a});
-    }
-    else
-#endif
-    {
-        // VPMOVSXWD output type
-        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
-        // Extract 8 values from 128bit lane and sign extend
-        return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Generate a VPERMD operation (shuffle 32 bit integer values 
-/// across 128 bit lanes) in LLVM IR.  If not supported on the underlying 
-/// platform, emulate it
-/// @param a - 256bit SIMD lane(8x32bit) of integer values.
-/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
-Value *Builder::PERMD(Value* a, Value* idx)
-{
-    Value* res;
-    // use avx2 permute instruction if available
-    if(JM()->mArch.AVX2())
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
+    /// bits) in LLVM IR.  If not supported on the underlying platform, emulate it
+    /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
+    Value *Builder::PMOVSXWD(Value* a)
     {
-        res = VPERMD(a, idx);
+        // llvm-3.9 removed the pmovsxwd intrinsic
+    #if HAVE_LLVM < 0x309
+        // use avx2 word sign extend if available
+        if(JM()->mArch.AVX2())
+        {
+            Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
+            return CALL(pmovsxwd, std::initializer_list<Value*>{a});
+        }
+        else
+    #endif
+        {
+            // VPMOVSXWD output type
+            Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
+            // Extract 8 values from 128bit lane and sign extend
+            return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
+        }
     }
-    else
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Generate a VPERMD operation (shuffle 32 bit integer values 
+    /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying 
+    /// platform, emulate it
+    /// @param a - 256bit SIMD lane(8x32bit) of integer values.
+    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
+    Value *Builder::PERMD(Value* a, Value* idx)
     {
-        if (isa<Constant>(idx))
+        Value* res;
+        // use avx2 permute instruction if available
+        if(JM()->mArch.AVX2())
         {
-            res = VSHUFFLE(a, a, idx);
+            res = VPERMD(a, idx);
         }
         else
         {
-            res = VUNDEF_I();
-            for (uint32_t l = 0; l < JM()->mVWidth; ++l)
+            if (isa<Constant>(idx))
+            {
+                res = VSHUFFLE(a, a, idx);
+            }
+            else
             {
-                Value* pIndex = VEXTRACT(idx, C(l));
-                Value* pVal = VEXTRACT(a, pIndex);
-                res = VINSERT(res, pVal, C(l));
+                res = VUNDEF_I();
+                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
+                {
+                    Value* pIndex = VEXTRACT(idx, C(l));
+                    Value* pVal = VEXTRACT(a, pIndex);
+                    res = VINSERT(res, pVal, C(l));
+                }
             }
         }
+        return res;
     }
-    return res;
-}
 
-//////////////////////////////////////////////////////////////////////////
-/// @brief Generate a VPERMPS operation (shuffle 32 bit float values 
-/// across 128 bit lanes) in LLVM IR.  If not supported on the underlying 
-/// platform, emulate it
-/// @param a - 256bit SIMD lane(8x32bit) of float values.
-/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
-Value *Builder::PERMPS(Value* a, Value* idx)
-{
-    Value* res;
-    // use avx2 permute instruction if available
-    if (JM()->mArch.AVX2())
-    {
-        // llvm 3.6.0 swapped the order of the args to vpermd
-        res = VPERMPS(idx, a);
-    }
-    else
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Generate a VPERMPS operation (shuffle 32 bit float values 
+    /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying 
+    /// platform, emulate it
+    /// @param a - 256bit SIMD lane(8x32bit) of float values.
+    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
+    Value *Builder::PERMPS(Value* a, Value* idx)
     {
-        if (isa<Constant>(idx))
+        Value* res;
+        // use avx2 permute instruction if available
+        if (JM()->mArch.AVX2())
         {
-            res = VSHUFFLE(a, a, idx);
+            // llvm 3.6.0 swapped the order of the args to vpermd
+            res = VPERMPS(idx, a);
         }
         else
         {
-            res = VUNDEF_F();
-            for (uint32_t l = 0; l < JM()->mVWidth; ++l)
+            if (isa<Constant>(idx))
             {
-                Value* pIndex = VEXTRACT(idx, C(l));
-                Value* pVal = VEXTRACT(a, pIndex);
-                res = VINSERT(res, pVal, C(l));
+                res = VSHUFFLE(a, a, idx);
+            }
+            else
+            {
+                res = VUNDEF_F();
+                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
+                {
+                    Value* pIndex = VEXTRACT(idx, C(l));
+                    Value* pVal = VEXTRACT(a, pIndex);
+                    res = VINSERT(res, pVal, C(l));
+                }
             }
         }
-    }
 
-    return res;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
-/// in LLVM IR.  If not supported on the underlying platform, emulate it
-/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
-Value *Builder::CVTPH2PS(Value* a)
-{
-    if (JM()->mArch.F16C())
-    {
-        return VCVTPH2PS(a);
+        return res;
     }
-    else
-    {
-        FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
-        Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
 
-        if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
+    /// in LLVM IR.  If not supported on the underlying platform, emulate it
+    /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
+    Value *Builder::CVTPH2PS(Value* a)
+    {
+        if (JM()->mArch.F16C())
         {
-            sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
+            return VCVTPH2PS(a);
         }
-
-        Value* pResult = UndefValue::get(mSimdFP32Ty);
-        for (uint32_t i = 0; i < mVWidth; ++i)
+        else
         {
-            Value* pSrc = VEXTRACT(a, C(i));
-            Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
-            pResult = VINSERT(pResult, pConv, C(i));
-        }
+            FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
+            Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
 
-        return pResult;
-    }
-}
+            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
+            {
+                sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
+            }
 
-//////////////////////////////////////////////////////////////////////////
-/// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
-/// in LLVM IR.  If not supported on the underlying platform, emulate it
-/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
-Value *Builder::CVTPS2PH(Value* a, Value* rounding)
-{
-    if (JM()->mArch.F16C())
-    {
-        return VCVTPS2PH(a, rounding);
-    }
-    else
-    {
-        // call scalar C function for now
-        FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
-        Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
+            Value* pResult = UndefValue::get(mSimdFP32Ty);
+            for (uint32_t i = 0; i < mVWidth; ++i)
+            {
+                Value* pSrc = VEXTRACT(a, C(i));
+                Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
+                pResult = VINSERT(pResult, pConv, C(i));
+            }
 
-        if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
-        {
-            sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
+            return pResult;
         }
+    }
 
-        Value* pResult = UndefValue::get(mSimdInt16Ty);
-        for (uint32_t i = 0; i < mVWidth; ++i)
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
+    /// in LLVM IR.  If not supported on the underlying platform, emulate it
+    /// @param a - SIMD vector of float32 values to convert to float16.
+    Value *Builder::CVTPS2PH(Value* a, Value* rounding)
+    {
+        if (JM()->mArch.F16C())
         {
-            Value* pSrc = VEXTRACT(a, C(i));
-            Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
-            pResult = VINSERT(pResult, pConv, C(i));
+            return VCVTPS2PH(a, rounding);
         }
+        else
+        {
+            // call scalar C function for now
+            FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
+            Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
 
-        return pResult;
-    }
-}
+            if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
+            {
+                sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
+            }
 
-Value *Builder::PMAXSD(Value* a, Value* b)
-{
-    // llvm-3.9 removed the pmax intrinsics
-#if HAVE_LLVM >= 0x309
-    Value* cmp = ICMP_SGT(a, b);
-    return SELECT(cmp, a, b);
-#else
-    if (JM()->mArch.AVX2())
-    {
-        Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
-        return CALL(pmaxsd, {a, b});
+            Value* pResult = UndefValue::get(mSimdInt16Ty);
+            for (uint32_t i = 0; i < mVWidth; ++i)
+            {
+                Value* pSrc = VEXTRACT(a, C(i));
+                Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
+                pResult = VINSERT(pResult, pConv, C(i));
+            }
+
+            return pResult;
+        }
     }
-    else
+
+    Value *Builder::PMAXSD(Value* a, Value* b)
     {
-        // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
-        Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
+        // llvm-3.9 removed the pmax intrinsics
+    #if HAVE_LLVM >= 0x309
+        Value* cmp = ICMP_SGT(a, b);
+        return SELECT(cmp, a, b);
+    #else
+        if (JM()->mArch.AVX2())
+        {
+            Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
+            return CALL(pmaxsd, {a, b});
+        }
+        else
+        {
+            // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
+            Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
 
-        // low 128
-        Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
-        Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
-        Value* resLo = CALL(pmaxsd, {aLo, bLo});
+            // low 128
+            Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
+            Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
+            Value* resLo = CALL(pmaxsd, {aLo, bLo});
 
-        // high 128
-        Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
-        Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
-        Value* resHi = CALL(pmaxsd, {aHi, bHi});
+            // high 128
+            Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
+            Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
+            Value* resHi = CALL(pmaxsd, {aHi, bHi});
 
-        // combine 
-        Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
-        result = VINSERTI128(result, resHi, C((uint8_t)1));
+            // combine 
+            Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
+            result = VINSERTI128(result, resHi, C((uint8_t)1));
 
-        return result;
+            return result;
+        }
+    #endif
     }
-#endif
-}
 
-Value *Builder::PMINSD(Value* a, Value* b)
-{
-    // llvm-3.9 removed the pmin intrinsics
-#if HAVE_LLVM >= 0x309
-    Value* cmp = ICMP_SLT(a, b);
-    return SELECT(cmp, a, b);
-#else
-    if (JM()->mArch.AVX2())
-    {
-        Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
-        return CALL(pminsd, {a, b});
-    }
-    else
+    Value *Builder::PMINSD(Value* a, Value* b)
     {
-        // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
-        Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
+        // llvm-3.9 removed the pmin intrinsics
+    #if HAVE_LLVM >= 0x309
+        Value* cmp = ICMP_SLT(a, b);
+        return SELECT(cmp, a, b);
+    #else
+        if (JM()->mArch.AVX2())
+        {
+            Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
+            return CALL(pminsd, {a, b});
+        }
+        else
+        {
+            // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
+            Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
 
-        // low 128
-        Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
-        Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
-        Value* resLo = CALL(pminsd, {aLo, bLo});
+            // low 128
+            Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
+            Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
+            Value* resLo = CALL(pminsd, {aLo, bLo});
 
-        // high 128
-        Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
-        Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
-        Value* resHi = CALL(pminsd, {aHi, bHi});
+            // high 128
+            Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
+            Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
+            Value* resHi = CALL(pminsd, {aHi, bHi});
 
-        // combine 
-        Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
-        result = VINSERTI128(result, resHi, C((uint8_t)1));
+            // combine 
+            Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
+            result = VINSERTI128(result, resHi, C((uint8_t)1));
 
-        return result;
+            return result;
+        }
+    #endif
     }
-#endif
-}
 
-void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, 
-                      Value* mask, Value* vGatherComponents[], bool bPackedOutput)
-{
-    const SWR_FORMAT_INFO &info = GetFormatInfo(format);
-    if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
+    void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, 
+                          Value* mask, Value* vGatherComponents[], bool bPackedOutput)
     {
-        // ensure our mask is the correct type
-        mask = BITCAST(mask, mSimdFP32Ty);
-        GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
+        const SWR_FORMAT_INFO &info = GetFormatInfo(format);
+        if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
+        {
+            // ensure our mask is the correct type
+            mask = BITCAST(mask, mSimdFP32Ty);
+            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
+        }
+        else
+        {
+            // ensure our mask is the correct type
+            mask = BITCAST(mask, mSimdInt32Ty);
+            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
+        }
     }
-    else
+
+    void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, 
+                            Value* mask, Value* vGatherComponents[], bool bPackedOutput)
     {
-        // ensure our mask is the correct type
-        mask = BITCAST(mask, mSimdInt32Ty);
-        GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
+        switch(info.bpp / info.numComps)
+        {
+            case 16: 
+            {
+                    Value* vGatherResult[2];
+                    Value *vMask;
+
+                    // TODO: vGatherMaskedVal
+                    Value* vGatherMaskedVal = VIMMED1((float)0);
+
+                    // always have at least one component out of x or y to fetch
+
+                    // save mask as it is zero'd out after each gather
+                    vMask = mask;
+
+                    vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
+                    // e.g. result of first 8x32bit integer gather for 16bit components
+                    // 256i - 0    1    2    3    4    5    6    7
+                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+                    //
+
+                    // if we have at least one component out of z or w to fetch
+                    if(info.numComps > 2)
+                    {
+                        // offset base to the next components(zw) in the vertex to gather
+                        pSrcBase = GEP(pSrcBase, C((char)4));
+                        vMask = mask;
+
+                        vGatherResult[1] =  GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
+                        // e.g. result of second 8x32bit integer gather for 16bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 
+                        //
+                    }
+                    else
+                    {
+                        vGatherResult[1] =  vGatherMaskedVal;
+                    }
+
+                    // Shuffle gathered components into place, each row is a component
+                    Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);  
+            }
+                break;
+            case 32: 
+            { 
+                // apply defaults
+                for (uint32_t i = 0; i < 4; ++i)
+                {
+                    vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
+                }
+
+                for(uint32_t i = 0; i < info.numComps; i++)
+                {
+                    uint32_t swizzleIndex = info.swizzle[i];
+
+                    // save mask as it is zero'd out after each gather
+                    Value *vMask = mask;
+
+                    // Gather a SIMD of components
+                    vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
+
+                    // offset base to the next component to gather
+                    pSrcBase = GEP(pSrcBase, C((char)4));
+                }
+            }
+                break;
+            default:
+                SWR_ASSERT(0, "Invalid float format");
+                break;
+        }
     }
-}
 
-void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, 
-                        Value* mask, Value* vGatherComponents[], bool bPackedOutput)
-{
-    switch(info.bpp / info.numComps)
+    void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
+                            Value* mask, Value* vGatherComponents[], bool bPackedOutput)
     {
-        case 16: 
+        switch (info.bpp / info.numComps)
         {
+            case 8:
+            {
+                Value* vGatherMaskedVal = VIMMED1((int32_t)0);
+                Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
+                // e.g. result of an 8x32bit integer gather for 8bit components
+                // 256i - 0    1    2    3    4    5    6    7
+                //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw 
+
+                Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);  
+            }
+                break;
+            case 16:
+            {
                 Value* vGatherResult[2];
                 Value *vMask;
 
                 // TODO: vGatherMaskedVal
-                Value* vGatherMaskedVal = VIMMED1((float)0);
+                Value* vGatherMaskedVal = VIMMED1((int32_t)0);
 
                 // always have at least one component out of x or y to fetch
 
                 // save mask as it is zero'd out after each gather
                 vMask = mask;
 
-                vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
+                vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
                 // e.g. result of first 8x32bit integer gather for 16bit components
                 // 256i - 0    1    2    3    4    5    6    7
                 //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
@@ -998,7 +1087,7 @@ void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byt
                     pSrcBase = GEP(pSrcBase, C((char)4));
                     vMask = mask;
 
-                    vGatherResult[1] =  GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
+                    vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
                     // e.g. result of second 8x32bit integer gather for 16bit components
                     // 256i - 0    1    2    3    4    5    6    7
                     //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 
@@ -1006,617 +1095,532 @@ void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byt
                 }
                 else
                 {
-                    vGatherResult[1] =  vGatherMaskedVal;
+                    vGatherResult[1] = vGatherMaskedVal;
                 }
 
                 // Shuffle gathered components into place, each row is a component
-                Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);  
-        }
-            break;
-        case 32: 
-        { 
-            // apply defaults
-            for (uint32_t i = 0; i < 4; ++i)
-            {
-                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
-            }
+                Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
 
-            for(uint32_t i = 0; i < info.numComps; i++)
+            }
+                break;
+            case 32:
             {
-                uint32_t swizzleIndex = info.swizzle[i];
+                // apply defaults
+                for (uint32_t i = 0; i < 4; ++i)
+                {
+                    vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
+                }
 
-                // save mask as it is zero'd out after each gather
-                Value *vMask = mask;
+                for(uint32_t i = 0; i < info.numComps; i++)
+                {
+                    uint32_t swizzleIndex = info.swizzle[i];
 
-                // Gather a SIMD of components
-                vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
+                    // save mask as it is zero'd out after each gather
+                    Value *vMask = mask;
 
-                // offset base to the next component to gather
-                pSrcBase = GEP(pSrcBase, C((char)4));
+                    // Gather a SIMD of components
+                    vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
+
+                    // offset base to the next component to gather
+                    pSrcBase = GEP(pSrcBase, C((char)4));
+                }
             }
-        }
-            break;
-        default:
-            SWR_ASSERT(0, "Invalid float format");
+                break;
+            default:
+                SWR_ASSERT(0, "unsupported format");
             break;
+        }
     }
-}
 
-void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
-                        Value* mask, Value* vGatherComponents[], bool bPackedOutput)
-{
-    switch (info.bpp / info.numComps)
+    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
     {
-        case 8:
-        {
-            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
-            Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
-            // e.g. result of an 8x32bit integer gather for 8bit components
-            // 256i - 0    1    2    3    4    5    6    7
-            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw 
+        // cast types
+        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
 
-            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);  
-        }
-            break;
-        case 16:
-        {
-            Value* vGatherResult[2];
-            Value *vMask;
+        // input could either be float or int vector; do shuffle work in int
+        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
+        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
 
-            // TODO: vGatherMaskedVal
-            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
-
-            // always have at least one component out of x or y to fetch
+        if(bPackedOutput) 
+        {
+            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
 
-            // save mask as it is zero'd out after each gather
-            vMask = mask;
+            // shuffle mask
+            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
+            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
+            // after pshufb: group components together in each 128bit lane
+            // 256i - 0    1    2    3    4    5    6    7
+            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
 
-            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
-            // e.g. result of first 8x32bit integer gather for 16bit components
+            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
+            // after PERMD: move and pack xy components into each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
-            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
-            //
+            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
 
-            // if we have at least one component out of x or y to fetch
-            if(info.numComps > 2)
+            // do the same for zw components
+            Value* vi128ZW = nullptr;
+            if(info.numComps > 2) 
             {
-                // offset base to the next components(zw) in the vertex to gather
-                pSrcBase = GEP(pSrcBase, C((char)4));
-                vMask = mask;
-
-                vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
-                // e.g. result of second 8x32bit integer gather for 16bit components
-                // 256i - 0    1    2    3    4    5    6    7
-                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 
-                //
+                Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
+                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
             }
-            else
+
+            for(uint32_t i = 0; i < 4; i++)
             {
-                vGatherResult[1] = vGatherMaskedVal;
-            }
+                uint32_t swizzleIndex = info.swizzle[i];
+                // todo: fixed for packed
+                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
+                if(i >= info.numComps)
+                {
+                    // set the default component val
+                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
+                    continue;
+                }
+
+                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+                // if x or y, use vi128XY permute result, else use vi128ZW
+                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
 
-            // Shuffle gathered components into place, each row is a component
-            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
+                // extract packed component 128 bit lanes 
+                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
+            }
 
         }
-            break;
-        case 32:
+        else 
         {
+            // pshufb masks for each component
+            Value* vConstMask[2];
+            // x/z shuffle mask
+            vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+                                     0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
+
+            // y/w shuffle mask
+            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
+                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
+
+
+            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
             // apply defaults
             for (uint32_t i = 0; i < 4; ++i)
             {
-                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
+                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
             }
 
             for(uint32_t i = 0; i < info.numComps; i++)
             {
                 uint32_t swizzleIndex = info.swizzle[i];
 
-                // save mask as it is zero'd out after each gather
-                Value *vMask = mask;
-
-                // Gather a SIMD of components
-                vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
+                // select correct constMask for x/z or y/w pshufb
+                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
+                // if x or y, use the first gather result (xy), else use the second (zw)
+                uint32_t selectedGather = (i < 2) ? 0 : 1;
 
-                // offset base to the next component to gather
-                pSrcBase = GEP(pSrcBase, C((char)4));
+                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+                // after pshufb mask for x channel; z uses the same shuffle from the second gather
+                // 256i - 0    1    2    3    4    5    6    7
+                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 
             }
         }
-            break;
-        default:
-            SWR_ASSERT(0, "unsupported format");
-        break;
     }
-}
-
-void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
-{
-    // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
-    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
-
-    // input could either be float or int vector; do shuffle work in int
-    vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
-    vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
-
-    if(bPackedOutput) 
-    {
-        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
-
-        // shuffle mask
-        Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
-                                     0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
-        Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
-        // after pshufb: group components together in each 128bit lane
-        // 256i - 0    1    2    3    4    5    6    7
-        //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
-
-        Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
-        // after PERMD: move and pack xy components into each 128bit lane
-        // 256i - 0    1    2    3    4    5    6    7
-        //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
-
-        // do the same for zw components
-        Value* vi128ZW = nullptr;
-        if(info.numComps > 2) 
-        {
-            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
-            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
-        }
 
-        for(uint32_t i = 0; i < 4; i++)
-        {
-            uint32_t swizzleIndex = info.swizzle[i];
-            // todo: fixed for packed
-            Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
-            if(i >= info.numComps)
-            {
-                // set the default component val
-                vGatherOutput[swizzleIndex] = vGatherMaskedVal;
-                continue;
-            }
-
-            // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
-            uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
-            // if x or y, use vi128XY permute result, else use vi128ZW
-            Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
-
-            // extract packed component 128 bit lanes 
-            vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
-        }
-
-    }
-    else 
+    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
     {
-        // pshufb masks for each component
-        Value* vConstMask[2];
-        // x/z shuffle mask
-        vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
-                                 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
-
-        // y/w shuffle mask
-        vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
-                                 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
+        // cast types
+        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+        Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
 
-
-        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
-        // apply defaults
-        for (uint32_t i = 0; i < 4; ++i)
+        if(bPackedOutput)
         {
-            vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
-        }
-
-        for(uint32_t i = 0; i < info.numComps; i++)
-        {
-            uint32_t swizzleIndex = info.swizzle[i];
-
-            // select correct constMask for x/z or y/w pshufb
-            uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
-            // if x or y, use vi128XY permute result, else use vi128ZW
-            uint32_t selectedGather = (i < 2) ? 0 : 1;
-
-            vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
-            // after pshufb mask for x channel; z uses the same shuffle from the second gather
+            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
+            // shuffle mask
+            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
+            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
+            // after pshufb: group components together in each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
-            //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 
-        }
-    }
-}
+            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
 
-void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
-{
-    // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
-    Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
-
-    if(bPackedOutput)
-    {
-        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
-        // shuffle mask
-        Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
-                                     0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
-        Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
-        // after pshufb: group components together in each 128bit lane
-        // 256i - 0    1    2    3    4    5    6    7
-        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
-
-        Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
-        // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
-        // 256i - 0    1    2    3    4    5    6    7
-        //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
-
-        // do the same for zw components
-        Value* vi128ZW = nullptr;
-        if(info.numComps > 2) 
-        {
-            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
-        }
+            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
+            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
+            // 256i - 0    1    2    3    4    5    6    7
+            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
 
-        // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
-        for(uint32_t i = 0; i < 4; i++)
-        {
-            uint32_t swizzleIndex = info.swizzle[i];
-            // todo: fix for packed
-            Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
-            if(i >= info.numComps)
+            // do the same for zw components
+            Value* vi128ZW = nullptr;
+            if(info.numComps > 2) 
             {
-                // set the default component val
-                vGatherOutput[swizzleIndex] = vGatherMaskedVal;
-                continue;
+                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
             }
 
-            // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
-            uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; 
-            // if x or y, use vi128XY permute result, else use vi128ZW
-            Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
+            // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
+            for(uint32_t i = 0; i < 4; i++)
+            {
+                uint32_t swizzleIndex = info.swizzle[i];
+                // todo: fix for packed
+                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
+                if(i >= info.numComps)
+                {
+                    // set the default component val
+                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
+                    continue;
+                }
+
+                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; 
+                // if x or y, use vi128XY permute result, else use vi128ZW
+                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
             
-            // sign extend
-            vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
-        }
-    }
-    // else zero extend
-    else{
-        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
-        // apply defaults
-        for (uint32_t i = 0; i < 4; ++i)
-        {
-            vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
+                // sign extend
+                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
+            }
         }
-
-        for(uint32_t i = 0; i < info.numComps; i++){
-            uint32_t swizzleIndex = info.swizzle[i];
-
-            // pshufb masks for each component
-            Value* vConstMask;
-            switch(i)
+        // else zero extend
+        else{
+            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
+            // apply defaults
+            for (uint32_t i = 0; i < 4; ++i)
             {
-                case 0:
-                    // x shuffle mask
-                    vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
-                                          0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
-                    break;
-                case 1:
-                    // y shuffle mask
-                    vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
-                                          1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
-                    break;
-                case 2:
-                    // z shuffle mask
-                    vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
-                                          2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
-                    break;
-                case 3:
-                    // w shuffle mask
-                    vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
-                                          3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
-                    break;
-                default:
-                    vConstMask = nullptr;
-                    break;
+                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
             }
 
-                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
-                // after pshufb for x channel
-                // 256i - 0    1    2    3    4    5    6    7
-                //        x000 x000 x000 x000 x000 x000 x000 x000 
+            for(uint32_t i = 0; i < info.numComps; i++){
+                uint32_t swizzleIndex = info.swizzle[i];
+
+                // pshufb masks for each component
+                Value* vConstMask;
+                switch(i)
+                {
+                    case 0:
+                        // x shuffle mask
+                        vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
+                                              0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
+                        break;
+                    case 1:
+                        // y shuffle mask
+                        vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
+                                              1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
+                        break;
+                    case 2:
+                        // z shuffle mask
+                        vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
+                                              2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
+                        break;
+                    case 3:
+                        // w shuffle mask
+                        vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
+                                              3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
+                        break;
+                    default:
+                        vConstMask = nullptr;
+                        break;
+                }
+
+                    vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
+                    // after pshufb for x channel
+                    // 256i - 0    1    2    3    4    5    6    7
+                    //        x000 x000 x000 x000 x000 x000 x000 x000 
+            }
         }
     }
-}
 
-// Helper function to create alloca in entry block of function
-Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
-{
-    auto saveIP = IRB()->saveIP();
-    IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
-                          pFunc->getEntryBlock().begin());
-    Value* pAlloca = ALLOCA(pType);
-    IRB()->restoreIP(saveIP);
-    return pAlloca;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief emulates a scatter operation.
-/// @param pDst - pointer to destination 
-/// @param vSrc - vector of src data to scatter
-/// @param vOffsets - vector of byte offsets from pDst
-/// @param vMask - mask of valid lanes
-void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
-{
-    /* Scatter algorithm
+    // Helper: emits an ALLOCA of pType at the very start of pFunc's entry block and returns it.
+    Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
+    {
+        auto saveIP = IRB()->saveIP();                           // remember the caller's insert point
+        IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
+                              pFunc->getEntryBlock().begin());   // move the builder to the first position of the entry block
+        Value* pAlloca = ALLOCA(pType);
+        IRB()->restoreIP(saveIP);                                // put the builder back where the caller left it
+        return pAlloca;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief emulates a scatter operation.
+    /// @param pDst - pointer to destination 
+    /// @param vSrc - vector of src data to scatter
+    /// @param vOffsets - vector of byte offsets from pDst
+    /// @param vMask - mask of valid lanes
+    void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
+    {
+        /* Scatter algorithm
     
-       while(Index = BitScanForward(mask))
-            srcElem = srcVector[Index]
-            offsetElem = offsetVector[Index]
-            *(pDst + offsetElem) = srcElem
-            Update mask (&= ~(1<<Index)
+           while(Index = BitScanForward(mask))
+                srcElem = srcVector[Index]
+                offsetElem = offsetVector[Index]
+                *(pDst + offsetElem) = srcElem
+                Update mask (&= ~(1<<Index))
 
-    */
+        */
 
-    BasicBlock* pCurBB = IRB()->GetInsertBlock();
-    Function* pFunc = pCurBB->getParent();
-    Type* pSrcTy = vSrc->getType()->getVectorElementType();
+        BasicBlock* pCurBB = IRB()->GetInsertBlock();
+        Function* pFunc = pCurBB->getParent();
+        Type* pSrcTy = vSrc->getType()->getVectorElementType();
 
-    // Store vectors on stack
-    if (pScatterStackSrc == nullptr)
-    {
-        // Save off stack allocations and reuse per scatter. Significantly reduces stack
-        // requirements for shaders with a lot of scatters.
-        pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
-        pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
-    }
+        // Store vectors on stack
+        if (pScatterStackSrc == nullptr)
+        {
+            // Save off stack allocations and reuse per scatter. Significantly reduces stack
+            // requirements for shaders with a lot of scatters.
+            pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
+            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
+        }
     
-    Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
-    Value* pOffsetsArrayPtr = pScatterStackOffsets;
-    STORE(vSrc, pSrcArrayPtr);
-    STORE(vOffsets, pOffsetsArrayPtr);
+        Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
+        Value* pOffsetsArrayPtr = pScatterStackOffsets;
+        STORE(vSrc, pSrcArrayPtr);
+        STORE(vOffsets, pOffsetsArrayPtr);
 
-    // Cast to pointers for random access
-    pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
-    pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
+        // Cast to pointers for random access
+        pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
+        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
 
-    Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
+        Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
 
-    // Get cttz function
-    Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
+        // Get cttz function
+        Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
     
-    // Setup loop basic block
-    BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
+        // Setup loop basic block
+        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
 
-    // compute first set bit
-    Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
+        // compute first set bit
+        Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
 
-    Value* pIsUndef = ICMP_EQ(pIndex, C(32));
+        Value* pIsUndef = ICMP_EQ(pIndex, C(32));
 
-    // Split current block
-    BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
+        // Split current block
+        BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
 
-    // Remove unconditional jump created by splitBasicBlock
-    pCurBB->getTerminator()->eraseFromParent();
+        // Remove unconditional jump created by splitBasicBlock
+        pCurBB->getTerminator()->eraseFromParent();
 
-    // Add terminator to end of original block
-    IRB()->SetInsertPoint(pCurBB);
+        // Add terminator to end of original block
+        IRB()->SetInsertPoint(pCurBB);
 
-    // Add conditional branch
-    COND_BR(pIsUndef, pPostLoop, pLoop);
+        // Add conditional branch
+        COND_BR(pIsUndef, pPostLoop, pLoop);
 
-    // Add loop basic block contents
-    IRB()->SetInsertPoint(pLoop);
-    PHINode* pIndexPhi = PHI(mInt32Ty, 2);
-    PHINode* pMaskPhi = PHI(mInt32Ty, 2);
+        // Add loop basic block contents
+        IRB()->SetInsertPoint(pLoop);
+        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
+        PHINode* pMaskPhi = PHI(mInt32Ty, 2);
 
-    pIndexPhi->addIncoming(pIndex, pCurBB);
-    pMaskPhi->addIncoming(pMask, pCurBB);
+        pIndexPhi->addIncoming(pIndex, pCurBB);
+        pMaskPhi->addIncoming(pMask, pCurBB);
 
-    // Extract elements for this index
-    Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
-    Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
+        // Extract elements for this index
+        Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
+        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
 
-    // GEP to this offset in dst
-    Value* pCurDst = GEP(pDst, pOffsetElem);
-    pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
-    STORE(pSrcElem, pCurDst);
+        // GEP to this offset in dst
+        Value* pCurDst = GEP(pDst, pOffsetElem);
+        pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
+        STORE(pSrcElem, pCurDst);
 
-    // Update the mask
-    Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
+        // Update the mask
+        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
 
-    // Terminator
-    Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
+        // Terminator
+        Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
 
-    pIsUndef = ICMP_EQ(pNewIndex, C(32));
-    COND_BR(pIsUndef, pPostLoop, pLoop);
+        pIsUndef = ICMP_EQ(pNewIndex, C(32));
+        COND_BR(pIsUndef, pPostLoop, pLoop);
 
-    // Update phi edges
-    pIndexPhi->addIncoming(pNewIndex, pLoop);
-    pMaskPhi->addIncoming(pNewMask, pLoop);
+        // Update phi edges
+        pIndexPhi->addIncoming(pNewIndex, pLoop);
+        pMaskPhi->addIncoming(pNewMask, pLoop);
 
-    // Move builder to beginning of post loop
-    IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
-}
+        // Move builder to beginning of post loop
+        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
+    }
 
-Value* Builder::VABSPS(Value* a)
-{
-    Value* asInt = BITCAST(a, mSimdInt32Ty);
-    Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
-    return result;
-}
+    Value* Builder::VABSPS(Value* a) // absolute value computed with a bit mask instead of a compare
+    {
+        Value* asInt = BITCAST(a, mSimdInt32Ty); // reinterpret the input as mSimdInt32Ty so it can be masked
+        Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty); // clear the top (sign) bit, then view the bits as mSimdFP32Ty again
+        return result;
+    }
 
-Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
-{
-    Value *lowCmp = ICMP_SLT(src, low);
-    Value *ret = SELECT(lowCmp, low, src);
+    Value *Builder::ICLAMP(Value* src, Value* low, Value* high) // clamps src to [low, high] using signed integer compares
+    {
+        Value *lowCmp = ICMP_SLT(src, low);       // src < low (signed)
+        Value *ret = SELECT(lowCmp, low, src);    // raise anything below low up to low
 
-    Value *highCmp = ICMP_SGT(ret, high);
-    ret = SELECT(highCmp, high, ret);
+        Value *highCmp = ICMP_SGT(ret, high);     // ret > high (signed)
+        ret = SELECT(highCmp, high, ret);         // cap anything above high; when low > high the result is always high
 
-    return ret;
-}
+        return ret;
+    }
 
-Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
-{
-    Value *lowCmp = FCMP_OLT(src, low);
-    Value *ret = SELECT(lowCmp, low, src);
+    Value *Builder::FCLAMP(Value* src, Value* low, Value* high) // clamps src to [low, high] using float compares and selects
+    {
+        Value *lowCmp = FCMP_OLT(src, low);       // src < low; NOTE(review): OLT/OGT look like ordered compares, so a NaN src presumably passes through unchanged - confirm
+        Value *ret = SELECT(lowCmp, low, src);    // raise anything below low up to low
 
-    Value *highCmp = FCMP_OGT(ret, high);
-    ret = SELECT(highCmp, high, ret);
+        Value *highCmp = FCMP_OGT(ret, high);     // ret > high
+        ret = SELECT(highCmp, high, ret);         // cap anything above high
 
-    return ret;
-}
+        return ret;
+    }
 
-Value *Builder::FCLAMP(Value* src, float low, float high)
-{
-    Value* result = VMAXPS(src, VIMMED1(low));
-    result = VMINPS(result, VIMMED1(high));
+    Value *Builder::FCLAMP(Value* src, float low, float high) // clamps src to [low, high] where the bounds are compile-time floats wrapped with VIMMED1
+    {
+        Value* result = VMAXPS(src, VIMMED1(low));    // lower bound: max(src, low)
+        result = VMINPS(result, VIMMED1(high));       // upper bound: min(result, high)
 
-    return result;
-}
+        return result;
+    }
 
-//////////////////////////////////////////////////////////////////////////
-/// @brief save/restore stack, providing ability to push/pop the stack and 
-///        reduce overall stack requirements for temporary stack use
-Value* Builder::STACKSAVE()
-{
-    Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
-#if HAVE_LLVM == 0x306
-    return CALL(pfnStackSave);
-#else
-    return CALLA(pfnStackSave);
-#endif
-}
-
-void Builder::STACKRESTORE(Value* pSaved)
-{
-    Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
-    CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
-}
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Emits a call to the llvm.stacksave intrinsic. Together with STACKRESTORE this gives
+    ///        push/pop behaviour for temporary stack use, reducing overall stack requirements.
+    Value* Builder::STACKSAVE() // returns the value produced by the emitted call
+    {
+        Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
+    #if HAVE_LLVM == 0x306
+        return CALL(pfnStackSave);    // LLVM 3.6 build
+    #else
+        return CALLA(pfnStackSave);   // every other LLVM version goes through CALLA
+    #endif
+    }
 
-Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
-{
-    Value* vOut;
-    // use FMADs if available
-    if(JM()->mArch.AVX2())
+    void Builder::STACKRESTORE(Value* pSaved) // emits a call to the llvm.stackrestore intrinsic with pSaved as its only argument
+    {
+        Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
+        CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved}); // presumably pSaved is a value returned by STACKSAVE - confirm at call sites
+    }
+
+    Value *Builder::FMADDPS(Value* a, Value* b, Value* c) // emits a * b + c, fused when the target allows it
     {
-        vOut = VFMADDPS(a, b, c);
+        Value* vOut;
+        // prefer the fused multiply-add op; otherwise fall back to a separate multiply and add
+        if(JM()->mArch.AVX2()) // NOTE(review): FMA use is keyed off the AVX2 flag - confirm every AVX2 target in use also has FMA
+        {
+            vOut = VFMADDPS(a, b, c);
+        }
+        else
+        {
+            vOut = FADD(FMUL(a, b), c); // unfused path: the product is computed first, then added to c
+        }
+        return vOut;
     }
-    else
+
+    Value* Builder::POPCNT(Value* a) // emits a call to the llvm.ctpop intrinsic on a and returns the call's result
     {
-        vOut = FADD(FMUL(a, b), c);
+        Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() }); // intrinsic is overloaded on a's own type
+        return CALL(pCtPop, std::initializer_list<Value*>{a});
     }
-    return vOut;
-}
 
-Value* Builder::POPCNT(Value* a)
-{
-    Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
-    return CALL(pCtPop, std::initializer_list<Value*>{a});
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief C functions called by LLVM IR
-//////////////////////////////////////////////////////////////////////////
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief called in JIT code, inserted by PRINT
-/// output to both stdout and visual studio debug console
-void __cdecl CallPrint(const char* fmt, ...)
-{
-    va_list args;
-    va_start(args, fmt);
-    vprintf(fmt, args);
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief C functions called by LLVM IR
+    //////////////////////////////////////////////////////////////////////////
 
-#if defined( _WIN32 )
-    char strBuf[1024];
-    vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
-    OutputDebugString(strBuf);
-#endif
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief printf-style C callback invoked from JIT code (calls are inserted by PRINT).
+    ///        Writes to stdout and, when _WIN32 is defined, also to the Visual Studio debug console.
+    void __cdecl CallPrint(const char* fmt, ...)
+    {
+        va_list args;
+        va_start(args, fmt);
+        vprintf(fmt, args);                           // stdout copy; this traverses args
 
-    va_end(args);
-}
+    #if defined( _WIN32 )
+        char strBuf[1024];                            // debug-console text is cut to this buffer size via _TRUNCATE
+        vsnprintf_s(strBuf, _TRUNCATE, fmt, args);    // NOTE(review): args is reused after vprintf with no va_copy or fresh va_start; its value is indeterminate per the C standard
+        OutputDebugString(strBuf);
+    #endif
 
-Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
-{
-#if HAVE_LLVM == 0x306
-    Function *func =
-        Intrinsic::getDeclaration(JM()->mpCurrentModule,
-                                  Intrinsic::x86_avx_vextractf128_si_256);
-    return CALL(func, {a, imm8});
-#else
-    bool flag = !imm8->isZeroValue();
-    SmallVector<Constant*,8> idx;
-    for (unsigned i = 0; i < mVWidth / 2; i++) {
-        idx.push_back(C(flag ? i + mVWidth / 2 : i));
-    }
-    return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
-#endif
-}
-
-Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
-{
-#if HAVE_LLVM == 0x306
-    Function *func =
-        Intrinsic::getDeclaration(JM()->mpCurrentModule,
-                                  Intrinsic::x86_avx_vinsertf128_si_256);
-    return CALL(func, {a, b, imm8});
-#else
-    bool flag = !imm8->isZeroValue();
-    SmallVector<Constant*,8> idx;
-    for (unsigned i = 0; i < mVWidth; i++) {
-        idx.push_back(C(i));
-    }
-    Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
-
-    SmallVector<Constant*,8> idx2;
-    for (unsigned i = 0; i < mVWidth / 2; i++) {
-        idx2.push_back(C(flag ? i : i + mVWidth));
-    }
-    for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
-        idx2.push_back(C(flag ? i + mVWidth / 2 : i));
-    }
-    return VSHUFFLE(a, inter, ConstantVector::get(idx2));
-#endif
-}
-
-// rdtsc buckets macros
-void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
-{
-    // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
-    // buckets framework when single threaded
-    if (KNOB_SINGLE_THREADED)
-    {
-        std::vector<Type*> args{
-            PointerType::get(mInt32Ty, 0),   // pBucketMgr
-            mInt32Ty                        // id
-        };
-
-        FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
-        Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
-        if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
-        {
-            sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
+        va_end(args);
+    }
+
+    Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) // extracts one half of a: the lower half when imm8 is zero, the upper half otherwise
+    {
+    #if HAVE_LLVM == 0x306
+        Function *func =
+            Intrinsic::getDeclaration(JM()->mpCurrentModule,
+                                      Intrinsic::x86_avx_vextractf128_si_256);
+        return CALL(func, {a, imm8}); // LLVM 3.6 build: call the x86 intrinsic directly
+    #else
+        bool flag = !imm8->isZeroValue(); // true selects the upper half
+        SmallVector<Constant*,8> idx;
+        for (unsigned i = 0; i < mVWidth / 2; i++) { // build mVWidth / 2 shuffle indices
+            idx.push_back(C(flag ? i + mVWidth / 2 : i)); // [mVWidth/2, mVWidth) for the upper half, [0, mVWidth/2) for the lower
         }
+        return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx)); // every index is below mVWidth, so only lanes of a are picked
+    #endif
+    }
 
-        CALL(pFunc, { pBucketMgr, pId });
+    Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) // returns a with one half replaced by b: the lower half when imm8 is zero, the upper half otherwise
+    {
+    #if HAVE_LLVM == 0x306
+        Function *func =
+            Intrinsic::getDeclaration(JM()->mpCurrentModule,
+                                      Intrinsic::x86_avx_vinsertf128_si_256);
+        return CALL(func, {a, b, imm8}); // LLVM 3.6 build: call the x86 intrinsic directly
+    #else
+        bool flag = !imm8->isZeroValue(); // true means b lands in the upper half
+        SmallVector<Constant*,8> idx;
+        for (unsigned i = 0; i < mVWidth; i++) { // identity indices 0 .. mVWidth-1
+            idx.push_back(C(i));
+        }
+        Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx)); // mVWidth-lane copy of b; assumes b has mVWidth / 2 lanes so it is widened here - TODO confirm
+
+        SmallVector<Constant*,8> idx2;
+        for (unsigned i = 0; i < mVWidth / 2; i++) { // lower half of the result
+            idx2.push_back(C(flag ? i : i + mVWidth)); // a[i] when flag is set, otherwise inter[i] (indices >= mVWidth address the second operand)
+        }
+        for (unsigned i = mVWidth / 2; i < mVWidth; i++) { // upper half of the result
+            idx2.push_back(C(flag ? i + mVWidth / 2 : i)); // inter[i - mVWidth/2] when flag is set, otherwise a[i]
+        }
+        return VSHUFFLE(a, inter, ConstantVector::get(idx2));
+    #endif
     }
-}
 
-void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
-{
-    // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
-    // buckets framework when single threaded
-    if (KNOB_SINGLE_THREADED)
-    {
-        std::vector<Type*> args{
-            PointerType::get(mInt32Ty, 0),   // pBucketMgr
-            mInt32Ty                        // id
-        };
-
-        FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
-        Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
-        if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
+    // rdtsc bucket helper: emits a call to the host function BucketManager_StartBucket(pBucketMgr, pId)
+    void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
+    {
+        // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
+        // the buckets framework when single threaded; nothing is emitted otherwise
+        if (KNOB_SINGLE_THREADED)
         {
-            sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
+            std::vector<Type*> args{
+                PointerType::get(mInt32Ty, 0),   // pBucketMgr
+                mInt32Ty                        // id
+            };
+
+            FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); // void(i32*, i32), not variadic
+            Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy)); // declare the callee in the current module (or reuse an existing declaration)
+            if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr) // register the host address only if the name does not resolve yet
+            {
+                sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
+            }
+
+            CALL(pFunc, { pBucketMgr, pId });
         }
+    }
 
-        CALL(pFunc, { pBucketMgr, pId });
+    void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId) // rdtsc bucket helper: emits a call to the host function BucketManager_StopBucket(pBucketMgr, pId)
+    {
+        // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
+        // the buckets framework when single threaded; nothing is emitted otherwise
+        if (KNOB_SINGLE_THREADED)
+        {
+            std::vector<Type*> args{
+                PointerType::get(mInt32Ty, 0),   // pBucketMgr
+                mInt32Ty                        // id
+            };
+
+            FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); // void(i32*, i32), not variadic
+            Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy)); // declare the callee in the current module (or reuse an existing declaration)
+            if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr) // register the host address only if the name does not resolve yet
+            {
+                sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
+            }
+
+            CALL(pFunc, { pBucketMgr, pId });
+        }
     }
-}
 
+}
\ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 986eced15f7..bdd818b6b8f 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -35,6 +35,8 @@
 #include <tuple>
 
 //#define FETCH_DUMP_VERTEX 1
+using namespace llvm;
+using namespace SwrJit;
 
 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
index 9c00f2264e0..c6d09413211 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
@@ -259,7 +259,11 @@ def generate_gen_cpp(functions, output_file):
 
     output_lines += [
         '#include \"builder.h\"',
-        ''
+        '',
+        'namespace SwrJit',
+        '{',
+        '    using namespace llvm;',
+        '',
     ]
 
     for func in functions:
@@ -277,14 +281,14 @@ def generate_gen_cpp(functions, output_file):
             first_arg = False
 
         output_lines += [
-            '//////////////////////////////////////////////////////////////////////////',
-            '%sBuilder::%s(%s)' % (func['return'], name, func['args_nodefs']),
-            '{',
-            '   return IRB()->%s(%s);' % (func['name'], func_args),
-            '}',
+            '    //////////////////////////////////////////////////////////////////////////',
+            '    %sBuilder::%s(%s)' % (func['return'], name, func['args_nodefs']),
+            '    {',
+            '       return IRB()->%s(%s);' % (func['name'], func_args),
+            '    }',
             '',
         ]
-
+    output_lines.append('}')
     output_file.write('\n'.join(output_lines) + '\n')
 
 """
@@ -326,7 +330,11 @@ def generate_x86_cpp(output_file):
 
     output_lines += [
         '#include \"builder.h\"',
-        ''
+        '',
+        'namespace SwrJit',
+        '{',
+        '    using namespace llvm;',
+        '',
     ]
 
     for inst in intrinsics:
@@ -344,10 +352,10 @@ def generate_x86_cpp(output_file):
             first = False
 
         output_lines += [
-            '//////////////////////////////////////////////////////////////////////////',
-            'Value *Builder::%s(%s)' % (inst[0], args),
-            '{',
-            '    Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::%s);' % inst[1],
+            '    //////////////////////////////////////////////////////////////////////////',
+            '    Value *Builder::%s(%s)' % (inst[0], args),
+            '    {',
+            '        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::%s);' % inst[1],
         ]
         if inst[0] == "VPERMD":
             rev_args = ''
@@ -360,21 +368,22 @@ def generate_x86_cpp(output_file):
 
             output_lines += [
                 '#if (HAVE_LLVM == 0x306) && (LLVM_VERSION_PATCH == 0)',
-                '    return CALL(func, std::initializer_list<Value*>{%s});' % rev_args,
+                '        return CALL(func, std::initializer_list<Value*>{%s});' % rev_args,
                 '#else',
             ]
         output_lines += [
-            '    return CALL(func, std::initializer_list<Value*>{%s});' % pass_args,
+            '        return CALL(func, std::initializer_list<Value*>{%s});' % pass_args,
         ]
         if inst[0] == "VPERMD":
             output_lines += [
                 '#endif',
             ]
         output_lines += [
-            '}',
+            '    }',
             '',
         ]
 
+    output_lines.append('}')
     output_file.write('\n'.join(output_lines) + '\n')
 
 """
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
index d6babd39b6b..e88158c14d8 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
@@ -59,6 +59,10 @@ header = r"""
 
 #pragma once
 
+namespace SwrJit
+{
+    using namespace llvm;
+
 """
 
 """
@@ -120,7 +124,7 @@ def gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_a
     elif is_array:
         llvm_type = 'ArrayType::get(%s, %s)' % (llvm_type, array_count)
 
-    return ['    members.push_back( %s );    // %s' % (llvm_type, name)]
+    return ['        members.push_back( %s );    // %s' % (llvm_type, name)]
 
 """
 """
@@ -151,12 +155,12 @@ def gen_llvm_types(input_file, output_file):
                 struct_name = match.group(3).strip()
 
                 output_lines += [
-                    '//////////////////////////////////////////////////////////////////////////',
-                    '/// Generate LLVM type information for %s' % struct_name,
-                    'INLINE static StructType *Gen_%s%s(JitManager* pJitMgr)' % (struct_name, postfix_name),
-                    '{',
-                    '    LLVMContext& ctx = pJitMgr->mContext;',
-                    '    std::vector<Type*> members;',
+                    '    //////////////////////////////////////////////////////////////////////////',
+                    '    /// Generate LLVM type information for %s' % struct_name,
+                    '    INLINE static StructType *Gen_%s%s(JitManager* pJitMgr)' % (struct_name, postfix_name),
+                    '    {',
+                    '        LLVMContext& ctx = pJitMgr->mContext;',
+                    '        std::vector<Type*> members;',
                     '',
                 ]
 
@@ -309,16 +313,17 @@ def gen_llvm_types(input_file, output_file):
                     if (end_of_struct):
                         output_lines += [
                             '',
-                            '    return StructType::get(ctx, members, false);',
-                            '}',
+                            '        return StructType::get(ctx, members, false);',
+                            '    }',
                             '',
                         ]
 
                         for i in range(len(llvm_args)):
-                            output_lines.append('static const uint32_t %s%s_%s = %s;' % (struct_name, postfix_name, llvm_args[i], i))
+                            output_lines.append('    static const uint32_t %s%s_%s = %s;' % (struct_name, postfix_name, llvm_args[i], i))
 
                         output_lines.append('')
 
+    output_lines.append('}')
     output_file.write('\n'.join(output_lines) + '\n')
 
 """
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
index 289422b11e1..c4fb3724c7c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -36,6 +36,9 @@
 #include <sstream>
 #include <unordered_set>
 
+using namespace llvm;
+using namespace SwrJit;
+
 //////////////////////////////////////////////////////////////////////////
 /// Interface to Jitting a fetch shader
 //////////////////////////////////////////////////////////////////////////
diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp
index ecb4545d13b..38a916e272c 100644
--- a/src/gallium/drivers/swr/swr_shader.cpp
+++ b/src/gallium/drivers/swr/swr_shader.cpp
@@ -44,6 +44,8 @@
 #include "swr_state.h"
 #include "swr_screen.h"
 
+using namespace SwrJit;
+
 static unsigned
 locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info);
 
diff --git a/src/gallium/drivers/swr/swr_tex_sample.cpp b/src/gallium/drivers/swr/swr_tex_sample.cpp
index 180ade14bc6..6eb5ea67733 100644
--- a/src/gallium/drivers/swr/swr_tex_sample.cpp
+++ b/src/gallium/drivers/swr/swr_tex_sample.cpp
@@ -60,6 +60,7 @@
 #include "swr_tex_sample.h"
 #include "swr_context_llvm.h"
 
+using namespace SwrJit;
 
 /**
  * This provides the bridge between the sampler state store in
-- 
2.34.1