llvmpipe: SoA blending.
author José Fonseca <jfonseca@vmware.com>
Sun, 9 Aug 2009 11:39:38 +0000 (12:39 +0100)
committer José Fonseca <jfonseca@vmware.com>
Sat, 29 Aug 2009 08:21:27 +0000 (09:21 +0100)
Throughput seems to be 4x higher.

src/gallium/drivers/llvmpipe/SConscript
src/gallium/drivers/llvmpipe/lp_bld.h
src/gallium/drivers/llvmpipe/lp_bld_blend.h [new file with mode: 0644]
src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c [moved from src/gallium/drivers/llvmpipe/lp_bld_blend.c with 80% similarity]
src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c [new file with mode: 0644]
src/gallium/drivers/llvmpipe/lp_test_blend.c

index aca4f21..0a8e6e8 100644 (file)
@@ -11,6 +11,8 @@ llvmpipe = env.ConvenienceLibrary(
                'lp_fs_sse.c',
                'lp_fs_llvm.c',
                'lp_bld_arit.c',
+               'lp_bld_blend_aos.c',
+               'lp_bld_blend_soa.c',
                'lp_bld_const.c',
                'lp_bld_conv.c',
                'lp_bld_intr.c',
@@ -20,7 +22,6 @@ llvmpipe = env.ConvenienceLibrary(
                'lp_bld_store.c',
                'lp_bld_loop.c',
                'lp_bld_logicop.c',
-               'lp_bld_blend.c',
                'lp_bld_swizzle.c',
                'lp_bld_type.c',
                'lp_clear.c',
index e9d9c25..a725cbb 100644 (file)
@@ -45,7 +45,6 @@
 #include "pipe/p_format.h"
 
 
-struct pipe_blend_state;
 union lp_type;
 
 
@@ -132,14 +131,4 @@ lp_build_logicop(LLVMBuilderRef builder,
                  LLVMValueRef dst);
 
 
-LLVMValueRef
-lp_build_blend(LLVMBuilderRef builder,
-               const struct pipe_blend_state *blend,
-               union lp_type type,
-               LLVMValueRef src,
-               LLVMValueRef dst,
-               LLVMValueRef const_,
-               unsigned alpha_swizzle);
-
-
 #endif /* !LP_BLD_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.h b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
new file mode 100644 (file)
index 0000000..36f53da
--- /dev/null
@@ -0,0 +1,94 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef LP_BLD_BLEND_H
+#define LP_BLD_BLEND_H
+
+
+/**
+ * @file
+ * LLVM IR building helpers interfaces.
+ *
+ * We use LLVM-C bindings for now. They are not documented, but follow the C++
+ * interfaces very closely, and appear to be complete enough for code
+ * generation. See
+ * http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html
+ * for a standalone example.
+ */
+
+#include <llvm-c/Core.h>  
+#include "pipe/p_format.h"
+
+
+struct pipe_blend_state;
+union lp_type;
+struct lp_build_context;
+
+
+/**
+ * Whether the blending function is commutative or not.
+ *
+ * @param func  a PIPE_BLEND_x blend equation value
+ * @return TRUE for PIPE_BLEND_ADD, PIPE_BLEND_MIN and PIPE_BLEND_MAX;
+ *         FALSE for PIPE_BLEND_SUBTRACT and PIPE_BLEND_REVERSE_SUBTRACT.
+ */
+boolean
+lp_build_blend_func_commutative(unsigned func);
+
+
+/**
+ * Whether the blending functions are the reverse of each other.
+ *
+ * @return TRUE only when one of the two functions is PIPE_BLEND_SUBTRACT and
+ *         the other is PIPE_BLEND_REVERSE_SUBTRACT; FALSE otherwise,
+ *         including when both functions are equal.
+ */
+boolean
+lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func);
+
+
+/**
+ * Combine two terms with the blend equation @p func.
+ *
+ * PIPE_BLEND_ADD, PIPE_BLEND_MIN and PIPE_BLEND_MAX are built with
+ * lp_build_add(), lp_build_min() and lp_build_max() on (term1, term2);
+ * PIPE_BLEND_SUBTRACT is built with lp_build_sub(term1, term2) and
+ * PIPE_BLEND_REVERSE_SUBTRACT with lp_build_sub(term2, term1).
+ *
+ * @param bld    build context used to emit the operation
+ * @param func   a PIPE_BLEND_x blend equation value
+ * @param term1  first term
+ * @param term2  second term
+ * @return the combined value; bld->zero (after an assertion) for an unknown
+ *         @p func.
+ */
+LLVMValueRef
+lp_build_blend_func(struct lp_build_context *bld,
+                    unsigned func,
+                    LLVMValueRef term1,
+                    LLVMValueRef term2);
+
+
+/**
+ * Generate blending IR for values in AoS form.
+ *
+ * @param builder        builder used to emit the instructions
+ * @param blend          blend state supplying the functions and factors
+ * @param type           type of the src/dst/const_ values
+ * @param src            source value
+ * @param dst            destination value
+ * @param const_         blend constant value
+ * @param alpha_swizzle  presumably the position of the alpha channel within
+ *                       each pixel -- TODO confirm against the
+ *                       implementation
+ * @return the blended value.
+ */
+LLVMValueRef
+lp_build_blend_aos(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   union lp_type type,
+                   LLVMValueRef src,
+                   LLVMValueRef dst,
+                   LLVMValueRef const_,
+                   unsigned alpha_swizzle);
+
+
+/**
+ * Generate blending IR for values in SoA form, one value per channel.
+ *
+ * Entries 0..2 of each array use the rgb_* function and factors of
+ * @p blend, entry 3 uses the alpha_* ones.
+ *
+ * @param builder  builder used to emit the instructions
+ * @param blend    blend state supplying the functions and factors
+ * @param type     type used to initialize the build context
+ * @param src      the four source channel values
+ * @param dst      the four destination channel values
+ * @param const_   the four blend constant channel values
+ * @param res      receives the four blended channel values
+ */
+void
+lp_build_blend_soa(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   union lp_type type,
+                   LLVMValueRef src[4],
+                   LLVMValueRef dst[4],
+                   LLVMValueRef const_[4],
+                   LLVMValueRef res[4]);
+
+
+#endif /* !LP_BLD_BLEND_H */
 
 /**
  * @file
- * Blend LLVM IR generation.
- *
- * This code is generic -- it should be able to cope both with floating point
- * and integer inputs in AOS form.
+ * Blend LLVM IR generation -- AOS form.
  *
  * @author Jose Fonseca <jfonseca@vmware.com>
  */
 
 #include "pipe/p_state.h"
 
-#include "lp_bld.h"
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_arit.h"
 #include "lp_bld_swizzle.h"
+#include "lp_bld_blend.h"
 
 
 /**
@@ -51,7 +48,7 @@
  * recomputing them. Also reusing the values allows us to do simplifications
  * that LLVM optimization passes wouldn't normally be able to do.
  */
-struct lp_build_blend_context
+struct lp_build_blend_aos_context
 {
    struct lp_build_context base;
    
@@ -72,7 +69,7 @@ struct lp_build_blend_context
 
 
 static LLVMValueRef
-lp_build_blend_factor_unswizzled(struct lp_build_blend_context *bld,
+lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
                                  unsigned factor,
                                  boolean alpha)
 {
@@ -174,7 +171,7 @@ lp_build_blend_factor_swizzle(unsigned factor)
 
 
 static LLVMValueRef
-lp_build_blend_swizzle(struct lp_build_blend_context *bld,
+lp_build_blend_swizzle(struct lp_build_blend_aos_context *bld,
                        LLVMValueRef rgb, 
                        LLVMValueRef alpha, 
                        enum lp_build_blend_swizzle rgb_swizzle,
@@ -211,7 +208,7 @@ lp_build_blend_swizzle(struct lp_build_blend_context *bld,
  * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendFuncSeparate.xml
  */
 static LLVMValueRef
-lp_build_blend_factor(struct lp_build_blend_context *bld,
+lp_build_blend_factor(struct lp_build_blend_aos_context *bld,
                       LLVMValueRef factor1,
                       unsigned rgb_factor,
                       unsigned alpha_factor,
@@ -233,44 +230,75 @@ lp_build_blend_factor(struct lp_build_blend_context *bld,
 }
 
 
+/**
+ * Report whether a blend equation is commutative.
+ *
+ * The two subtract equations are not; add, min and max are. Any other value
+ * trips an assertion and is then reported as commutative.
+ */
+boolean
+lp_build_blend_func_commutative(unsigned func)
+{
+   if(func == PIPE_BLEND_SUBTRACT ||
+      func == PIPE_BLEND_REVERSE_SUBTRACT)
+      return FALSE;
+
+   if(func != PIPE_BLEND_ADD &&
+      func != PIPE_BLEND_MIN &&
+      func != PIPE_BLEND_MAX) {
+      /* Unknown blend equation */
+      assert(0);
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * Report whether two blend equations are each other's reverse, i.e. one is
+ * PIPE_BLEND_SUBTRACT and the other PIPE_BLEND_REVERSE_SUBTRACT.
+ */
+boolean
+lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func)
+{
+   switch (rgb_func) {
+   case PIPE_BLEND_SUBTRACT:
+      return alpha_func == PIPE_BLEND_REVERSE_SUBTRACT ? TRUE : FALSE;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      return alpha_func == PIPE_BLEND_SUBTRACT ? TRUE : FALSE;
+   default:
+      /* No other equation has a reversed counterpart */
+      return FALSE;
+   }
+}
+
+
 /**
  * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendEquationSeparate.xml
  */
-static LLVMValueRef
-lp_build_blend_func(struct lp_build_blend_context *bld,
+LLVMValueRef
+lp_build_blend_func(struct lp_build_context *bld,
                     unsigned func,
                     LLVMValueRef term1, 
                     LLVMValueRef term2)
 {
    switch (func) {
    case PIPE_BLEND_ADD:
-      return lp_build_add(&bld->base, term1, term2);
+      return lp_build_add(bld, term1, term2);
       break;
    case PIPE_BLEND_SUBTRACT:
-      return lp_build_sub(&bld->base, term1, term2);
+      return lp_build_sub(bld, term1, term2);
    case PIPE_BLEND_REVERSE_SUBTRACT:
-      return lp_build_sub(&bld->base, term2, term1);
+      return lp_build_sub(bld, term2, term1);
    case PIPE_BLEND_MIN:
-      return lp_build_min(&bld->base, term1, term2);
+      return lp_build_min(bld, term1, term2);
    case PIPE_BLEND_MAX:
-      return lp_build_max(&bld->base, term1, term2);
+      return lp_build_max(bld, term1, term2);
    default:
       assert(0);
-      return bld->base.zero;
+      return bld->zero;
    }
 }
 
 
 LLVMValueRef
-lp_build_blend(LLVMBuilderRef builder,
-               const struct pipe_blend_state *blend,
-               union lp_type type,
-               LLVMValueRef src,
-               LLVMValueRef dst,
-               LLVMValueRef const_,
-               unsigned alpha_swizzle)
+lp_build_blend_aos(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   union lp_type type,
+                   LLVMValueRef src,
+                   LLVMValueRef dst,
+                   LLVMValueRef const_,
+                   unsigned alpha_swizzle)
 {
-   struct lp_build_blend_context bld;
+   struct lp_build_blend_aos_context bld;
    LLVMValueRef src_term;
    LLVMValueRef dst_term;
 
@@ -284,8 +312,8 @@ lp_build_blend(LLVMBuilderRef builder,
    bld.dst = dst;
    bld.const_ = const_;
 
-   /* TODO: There are still a few optimization oportunities here. For certain
-    * combinations it is possible to reorder the operations and therefor saving
+   /* TODO: There are still a few optimization opportunities here. For certain
+    * combinations it is possible to reorder the operations and therefore saving
     * some instructions. */
 
    src_term = lp_build_blend_factor(&bld, src, blend->rgb_src_factor, blend->alpha_src_factor, alpha_swizzle);
@@ -297,7 +325,7 @@ lp_build_blend(LLVMBuilderRef builder,
 #endif
 
    if(blend->rgb_func == blend->alpha_func) {
-      return lp_build_blend_func(&bld, blend->rgb_func, src_term, dst_term);
+      return lp_build_blend_func(&bld.base, blend->rgb_func, src_term, dst_term);
    }
    else {
       /* Seperate RGB / A functions */
@@ -305,8 +333,8 @@ lp_build_blend(LLVMBuilderRef builder,
       LLVMValueRef rgb;
       LLVMValueRef alpha;
 
-      rgb   = lp_build_blend_func(&bld, blend->rgb_func,   src_term, dst_term);
-      alpha = lp_build_blend_func(&bld, blend->alpha_func, src_term, dst_term);
+      rgb   = lp_build_blend_func(&bld.base, blend->rgb_func,   src_term, dst_term);
+      alpha = lp_build_blend_func(&bld.base, blend->alpha_func, src_term, dst_term);
 
       return lp_build_blend_swizzle(&bld, rgb, alpha, LP_BUILD_BLEND_SWIZZLE_RGBA, alpha_swizzle);
    }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
new file mode 100644 (file)
index 0000000..1ef1718
--- /dev/null
@@ -0,0 +1,237 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Blend LLVM IR generation -- SoA.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "pipe/p_state.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_blend.h"
+
+
+/**
+ * We may use the same values several times, so we keep them here to avoid
+ * recomputing them. Also reusing the values allows us to do simplifications
+ * that LLVM optimization passes wouldn't normally be able to do.
+ *
+ * The whole structure is zeroed by lp_build_blend_soa() before use, so a
+ * NULL entry means "not computed yet".
+ */
+struct lp_build_blend_soa_context
+{
+   struct lp_build_context base;
+
+   /* Per-channel inputs, copied from the lp_build_blend_soa() arguments. */
+   LLVMValueRef src[4];
+   LLVMValueRef dst[4];
+   LLVMValueRef con[4];
+
+   /*
+    * lp_build_comp() of the corresponding input channel, filled in on first
+    * use by lp_build_blend_soa_factor().
+    */
+   LLVMValueRef inv_src[4];
+   LLVMValueRef inv_dst[4];
+   LLVMValueRef inv_con[4];
+
+   /* lp_build_min() of src[3] and inv_dst[3], filled in on first use. */
+   LLVMValueRef src_alpha_saturate;
+
+   /**
+    * We store all factors in a table in order to eliminate redundant
+    * multiplications later.
+    *
+    * factor[0][k] is the value being scaled and factor[1][k] the blend factor
+    * it is multiplied by; k = 0..3 are the source channels and k = 4..7 the
+    * destination channels.
+    */
+   LLVMValueRef factor[2][8];
+
+   /**
+    * Table with all terms.
+    *
+    * term[k] is the product of factor[0][k] and factor[1][k].
+    */
+   LLVMValueRef term[8];
+};
+
+
+/**
+ * Return the lp_build_comp() of @p value, emitting it only the first time
+ * and remembering it in @p slot afterwards.
+ */
+static LLVMValueRef
+lp_build_blend_soa_cached_comp(struct lp_build_blend_soa_context *bld,
+                               LLVMValueRef *slot,
+                               LLVMValueRef value)
+{
+   if(!*slot)
+      *slot = lp_build_comp(&bld->base, value);
+   return *slot;
+}
+
+
+/**
+ * Return the value of blend factor @p factor for channel @p i.
+ *
+ * Derived values (the inv_* entries and src_alpha_saturate) are cached in
+ * the context so that each one is emitted at most once.
+ */
+static LLVMValueRef
+lp_build_blend_soa_factor(struct lp_build_blend_soa_context *bld,
+                          unsigned factor, unsigned i)
+{
+   switch (factor) {
+   /* Constant factors */
+   case PIPE_BLENDFACTOR_ZERO:
+      return bld->base.zero;
+   case PIPE_BLENDFACTOR_ONE:
+      return bld->base.one;
+
+   /* Factors taken directly from the inputs */
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      return bld->src[i];
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      return bld->src[3];
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      return bld->dst[i];
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      return bld->dst[3];
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      return bld->con[i];
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      return bld->con[3];
+
+   /* Factors built with lp_build_comp() on the inputs */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      return lp_build_blend_soa_cached_comp(bld, &bld->inv_src[i], bld->src[i]);
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      return lp_build_blend_soa_cached_comp(bld, &bld->inv_src[3], bld->src[3]);
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      return lp_build_blend_soa_cached_comp(bld, &bld->inv_dst[i], bld->dst[i]);
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      return lp_build_blend_soa_cached_comp(bld, &bld->inv_dst[3], bld->dst[3]);
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      return lp_build_blend_soa_cached_comp(bld, &bld->inv_con[i], bld->con[i]);
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      return lp_build_blend_soa_cached_comp(bld, &bld->inv_con[3], bld->con[3]);
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* The alpha channel uses one; the others min(src[3], inv_dst[3]) */
+      if(i == 3)
+         return bld->base.one;
+      lp_build_blend_soa_cached_comp(bld, &bld->inv_dst[3], bld->dst[3]);
+      if(!bld->src_alpha_saturate)
+         bld->src_alpha_saturate = lp_build_min(&bld->base, bld->src[3], bld->inv_dst[3]);
+      return bld->src_alpha_saturate;
+
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      /* TODO */
+      assert(0);
+      return bld->base.zero;
+
+   default:
+      assert(0);
+      return bld->base.zero;
+   }
+}
+
+
+/**
+ * Generate blending IR for values in SoA form, one value per channel.
+ *
+ * For every channel i a source term (src[i] times the source factor) and a
+ * destination term (dst[i] times the destination factor) are built and then
+ * combined with lp_build_blend_func(). Channels 0..2 use the rgb_* function
+ * and factors of @p blend, channel 3 uses the alpha_* ones.
+ *
+ * Multiplications and combinations whose operands are the very same
+ * LLVMValueRef handles are emitted only once and shared between channels.
+ *
+ * @param builder  builder used to emit the instructions
+ * @param blend    blend state supplying the functions and factors
+ * @param type     type used to initialize the build context
+ * @param src      the four source channel values
+ * @param dst      the four destination channel values
+ * @param con      the four blend constant channel values
+ * @param res      receives the four blended channel values
+ */
+void
+lp_build_blend_soa(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   union lp_type type,
+                   LLVMValueRef src[4],
+                   LLVMValueRef dst[4],
+                   LLVMValueRef con[4],
+                   LLVMValueRef res[4])
+{
+   struct lp_build_blend_soa_context bld;
+   unsigned i, j;
+
+   /* Setup build context; zeroing marks every cached value as not computed */
+   memset(&bld, 0, sizeof bld);
+   lp_build_context_init(&bld.base, builder, type);
+   for (i = 0; i < 4; ++i) {
+      bld.src[i] = src[i];
+      bld.dst[i] = dst[i];
+      bld.con[i] = con[i];
+   }
+
+   /*
+    * Compute src/dst factors.
+    *
+    * Entries 0..3 of the table describe the source terms and entries 4..7
+    * the destination terms.
+    */
+   for (i = 0; i < 4; ++i) {
+      unsigned src_factor = i < 3 ? blend->rgb_src_factor : blend->alpha_src_factor;
+      unsigned dst_factor = i < 3 ? blend->rgb_dst_factor : blend->alpha_dst_factor;
+      bld.factor[0][0 + i] = src[i];
+      bld.factor[1][0 + i] = lp_build_blend_soa_factor(&bld, src_factor, i);
+      bld.factor[0][4 + i] = dst[i];
+      bld.factor[1][4 + i] = lp_build_blend_soa_factor(&bld, dst_factor, i);
+   }
+
+   /*
+    * Compute src/dst terms
+    */
+   for (i = 0; i < 8; ++i) {
+
+      /* See if this multiplication has been previously computed, with the
+       * operands in either order */
+      for(j = 0; j < i; ++j) {
+         if((bld.factor[0][j] == bld.factor[0][i] &&
+             bld.factor[1][j] == bld.factor[1][i]) ||
+            (bld.factor[0][j] == bld.factor[1][i] &&
+             bld.factor[1][j] == bld.factor[0][i]))
+            break;
+      }
+
+      if(j < i)
+         bld.term[i] = bld.term[j];
+      else
+         bld.term[i] = lp_build_mul(&bld.base, bld.factor[0][i], bld.factor[1][i]);
+   }
+
+   /*
+    * Combine terms
+    */
+   for (i = 0; i < 4; ++i) {
+      unsigned func = i < 3 ? blend->rgb_func : blend->alpha_func;
+      boolean func_commutative = lp_build_blend_func_commutative(func);
+
+      /* See if this function has been previously applied */
+      for(j = 0; j < i; ++j) {
+         unsigned prev_func = j < 3 ? blend->rgb_func : blend->alpha_func;
+         boolean func_same = func == prev_func;
+         boolean func_reverse = lp_build_blend_func_reverse(func, prev_func);
+
+         /*
+          * Compare the terms themselves. The factor table is two
+          * dimensional, so indexing it with a single subscript would compare
+          * row addresses (never equal for j != i, and out of bounds for
+          * indices above 1) instead of values.
+          *
+          * An earlier result can be shared when the same function was
+          * applied to the same terms, when the same commutative function was
+          * applied to the swapped terms, or when the reverse function was
+          * applied to the swapped terms.
+          */
+         if((func_same &&
+             bld.term[0 + j] == bld.term[0 + i] &&
+             bld.term[4 + j] == bld.term[4 + i]) ||
+            (((func_same && func_commutative) || func_reverse) &&
+             bld.term[0 + j] == bld.term[4 + i] &&
+             bld.term[4 + j] == bld.term[0 + i]))
+            break;
+      }
+
+      if(j < i)
+         res[i] = res[j];
+      else
+         res[i] = lp_build_blend_func(&bld.base, func, bld.term[i + 0], bld.term[i + 4]);
+   }
+}
index c8901fe..8bf5508 100644 (file)
  */
 
 
-#include "lp_bld.h"
 #include "lp_bld_type.h"
 #include "lp_bld_arit.h"
+#include "lp_bld_blend.h"
 #include "lp_test.h"
 
 
+/**
+ * Data layout exercised by a test.
+ *
+ * The numeric values matter: the test loop iterates over modes 0 and 1, and
+ * dump_blend_type() treats a non-zero mode as SoA.
+ */
+enum vector_mode
+{
+   AoS = 0,
+   SoA = 1
+};
+
+
 typedef void (*blend_test_ptr_t)(const void *src, const void *dst, const void *con, void *res);
 
 
@@ -52,6 +59,7 @@ write_tsv_header(FILE *fp)
    fprintf(fp,
            "result\t"
            "cycles_per_channel\t"
+           "mode\t"
            "type\t"
            "sep_func\t"
            "sep_src_factor\t"
@@ -70,13 +78,22 @@ write_tsv_header(FILE *fp)
 static void
 write_tsv_row(FILE *fp,
               const struct pipe_blend_state *blend,
+              enum vector_mode mode,
               union lp_type type,
               double cycles,
               boolean success)
 {
    fprintf(fp, "%s\t", success ? "pass" : "fail");
 
-   fprintf(fp, "%.1f\t", cycles / type.length);
+   if (mode == AoS) {
+      fprintf(fp, "%.1f\t", cycles / type.length);
+      fprintf(fp, "aos\t");
+   }
+
+   if (mode == SoA) {
+      fprintf(fp, "%.1f\t", cycles / (4 * type.length));
+      fprintf(fp, "soa\t");
+   }
 
    fprintf(fp, "%s%u%sx%u\t",
            type.floating ? "f" : (type.fixed ? "h" : (type.sign ? "s" : "u")),
@@ -106,10 +123,19 @@ write_tsv_row(FILE *fp,
 static void
 dump_blend_type(FILE *fp,
                 const struct pipe_blend_state *blend,
+                enum vector_mode mode,
                 union lp_type type)
 {
+   fprintf(fp, "%s", mode ? "soa" : "aos");
+
+   fprintf(fp, " type=%s%u%sx%u",
+           type.floating ? "f" : (type.fixed ? "h" : (type.sign ? "s" : "u")),
+           type.width,
+           type.norm ? "n" : "",
+           type.length);
+
    fprintf(fp,
-           "%s=%s %s=%s %s=%s %s=%s %s=%s %s=%s",
+           " %s=%s %s=%s %s=%s %s=%s %s=%s %s=%s",
            "rgb_func",         debug_dump_blend_func(blend->rgb_func, TRUE),
            "rgb_src_factor",   debug_dump_blend_factor(blend->rgb_src_factor, TRUE),
            "rgb_dst_factor",   debug_dump_blend_factor(blend->rgb_dst_factor, TRUE),
@@ -117,12 +143,6 @@ dump_blend_type(FILE *fp,
            "alpha_src_factor", debug_dump_blend_factor(blend->alpha_src_factor, TRUE),
            "alpha_dst_factor", debug_dump_blend_factor(blend->alpha_dst_factor, TRUE));
 
-   fprintf(fp, " type=%s%u%sx%u",
-           type.floating ? "f" : (type.fixed ? "h" : (type.sign ? "s" : "u")),
-           type.width,
-           type.norm ? "n" : "",
-           type.length);
-
    fprintf(fp, " ...\n");
    fflush(fp);
 }
@@ -131,6 +151,7 @@ dump_blend_type(FILE *fp,
 static LLVMValueRef
 add_blend_test(LLVMModuleRef module,
                const struct pipe_blend_state *blend,
+               enum vector_mode mode,
                union lp_type type)
 {
    LLVMTypeRef ret_type;
@@ -143,10 +164,6 @@ add_blend_test(LLVMModuleRef module,
    LLVMValueRef res_ptr;
    LLVMBasicBlockRef block;
    LLVMBuilderRef builder;
-   LLVMValueRef src;
-   LLVMValueRef dst;
-   LLVMValueRef con;
-   LLVMValueRef res;
 
    ret_type = LLVMInt64Type();
    vec_type = lp_build_vec_type(type);
@@ -163,15 +180,51 @@ add_blend_test(LLVMModuleRef module,
    builder = LLVMCreateBuilder();
    LLVMPositionBuilderAtEnd(builder, block);
 
-   src = LLVMBuildLoad(builder, src_ptr, "src");
-   dst = LLVMBuildLoad(builder, dst_ptr, "dst");
-   con = LLVMBuildLoad(builder, const_ptr, "const");
+   if (mode == AoS) {
+      LLVMValueRef src;
+      LLVMValueRef dst;
+      LLVMValueRef con;
+      LLVMValueRef res;
+
+      src = LLVMBuildLoad(builder, src_ptr, "src");
+      dst = LLVMBuildLoad(builder, dst_ptr, "dst");
+      con = LLVMBuildLoad(builder, const_ptr, "const");
 
-   res = lp_build_blend(builder, blend, type, src, dst, con, 3);
+      res = lp_build_blend_aos(builder, blend, type, src, dst, con, 3);
 
-   LLVMSetValueName(res, "res");
+      LLVMSetValueName(res, "res");
 
-   LLVMBuildStore(builder, res, res_ptr);
+      LLVMBuildStore(builder, res, res_ptr);
+   }
+
+   if (mode == SoA) {
+      LLVMValueRef src[4];
+      LLVMValueRef dst[4];
+      LLVMValueRef con[4];
+      LLVMValueRef res[4];
+      char src_name[5] = "src?";
+      char dst_name[5] = "dst?";
+      char con_name[5] = "con?";
+      char res_name[5] = "res?";
+      unsigned i;
+
+      for(i = 0; i < 4; ++i) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+         con_name[3] = dst_name[3] = src_name[3] = "rgba"[i];
+         src[i] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, src_ptr, &index, 1, ""), src_name);
+         dst[i] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dst_ptr, &index, 1, ""), dst_name);
+         con[i] = LLVMBuildLoad(builder, LLVMBuildGEP(builder, const_ptr, &index, 1, ""), con_name);
+      }
+
+      lp_build_blend_soa(builder, blend, type, src, dst, con, res);
+
+      for(i = 0; i < 4; ++i) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+         res_name[3] = "rgba"[i];
+         LLVMSetValueName(res[i], res_name);
+         LLVMBuildStore(builder, res[i], LLVMBuildGEP(builder, res_ptr, &index, 1, ""));
+      }
+   }
 
    LLVMBuildRetVoid(builder);;
 
@@ -415,6 +468,7 @@ static boolean
 test_one(unsigned verbose,
          FILE *fp,
          const struct pipe_blend_state *blend,
+         enum vector_mode mode,
          union lp_type type)
 {
    LLVMModuleRef module = NULL;
@@ -431,11 +485,11 @@ test_one(unsigned verbose,
    unsigned i, j;
 
    if(verbose >= 1)
-      dump_blend_type(stdout, blend, type);
+      dump_blend_type(stdout, blend, mode, type);
 
    module = LLVMModuleCreateWithName("test");
 
-   func = add_blend_test(module, blend, type);
+   func = add_blend_test(module, blend, mode, type);
 
    if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
       LLVMDumpModule(module);
@@ -446,7 +500,7 @@ test_one(unsigned verbose,
    provider = LLVMCreateModuleProviderForExistingModule(module);
    if (LLVMCreateJITCompiler(&engine, provider, 1, &error)) {
       if(verbose < 1)
-         dump_blend_type(stderr, blend, type);
+         dump_blend_type(stderr, blend, mode, type);
       fprintf(stderr, "%s\n", error);
       LLVMDisposeMessage(error);
       abort();
@@ -474,66 +528,148 @@ test_one(unsigned verbose,
 
    success = TRUE;
    for(i = 0; i < n && success; ++i) {
-      uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-      uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-      uint8_t con[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-      uint8_t res[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-      uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-      int64_t start_counter = 0;
-      int64_t end_counter = 0;
-
-      random_vec(type, src);
-      random_vec(type, dst);
-      random_vec(type, con);
-
-      {
-         double fsrc[LP_MAX_VECTOR_LENGTH];
-         double fdst[LP_MAX_VECTOR_LENGTH];
-         double fcon[LP_MAX_VECTOR_LENGTH];
-         double fref[LP_MAX_VECTOR_LENGTH];
-
-         read_vec(type, src, fsrc);
-         read_vec(type, dst, fdst);
-         read_vec(type, con, fcon);
-
-         for(j = 0; j < type.length; j += 4)
-            compute_blend_ref(blend, fsrc + j, fdst + j, fcon + j, fref + j);
-
-         write_vec(type, ref, fref);
+      if(mode == AoS) {
+         uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t con[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t res[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         int64_t start_counter = 0;
+         int64_t end_counter = 0;
+
+         random_vec(type, src);
+         random_vec(type, dst);
+         random_vec(type, con);
+
+         {
+            double fsrc[LP_MAX_VECTOR_LENGTH];
+            double fdst[LP_MAX_VECTOR_LENGTH];
+            double fcon[LP_MAX_VECTOR_LENGTH];
+            double fref[LP_MAX_VECTOR_LENGTH];
+
+            read_vec(type, src, fsrc);
+            read_vec(type, dst, fdst);
+            read_vec(type, con, fcon);
+
+            for(j = 0; j < type.length; j += 4)
+               compute_blend_ref(blend, fsrc + j, fdst + j, fcon + j, fref + j);
+
+            write_vec(type, ref, fref);
+         }
+
+         start_counter = rdtsc();
+         blend_test_ptr(src, dst, con, res);
+         end_counter = rdtsc();
+
+         cycles[i] = end_counter - start_counter;
+
+         if(!compare_vec(type, res, ref)) {
+            success = FALSE;
+
+            if(verbose < 1)
+               dump_blend_type(stderr, blend, mode, type);
+            fprintf(stderr, "MISMATCH\n");
+
+            fprintf(stderr, "  Src: ");
+            dump_vec(stderr, type, src);
+            fprintf(stderr, "\n");
+
+            fprintf(stderr, "  Dst: ");
+            dump_vec(stderr, type, dst);
+            fprintf(stderr, "\n");
+
+            fprintf(stderr, "  Con: ");
+            dump_vec(stderr, type, con);
+            fprintf(stderr, "\n");
+
+            fprintf(stderr, "  Res: ");
+            dump_vec(stderr, type, res);
+            fprintf(stderr, "\n");
+
+            fprintf(stderr, "  Ref: ");
+            dump_vec(stderr, type, ref);
+            fprintf(stderr, "\n");
+         }
       }
 
-      start_counter = rdtsc();
-      blend_test_ptr(src, dst, con, res);
-      end_counter = rdtsc();
+      if(mode == SoA) {
+         const unsigned stride = type.length*type.width/8;
+         uint8_t src[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t dst[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t con[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t res[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         uint8_t ref[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         int64_t start_counter = 0;
+         int64_t end_counter = 0;
+         boolean mismatch;
+
+         for(j = 0; j < 4; ++j) {
+            random_vec(type, src + j*stride);
+            random_vec(type, dst + j*stride);
+            random_vec(type, con + j*stride);
+         }
 
-      cycles[i] = end_counter - start_counter;
+         {
+            double fsrc[4];
+            double fdst[4];
+            double fcon[4];
+            double fref[4];
+            unsigned k;
+
+            for(k = 0; k < type.length; ++k) {
+               for(j = 0; j < 4; ++j) {
+                  fsrc[j] = read_elem(type, src + j*stride, k);
+                  fdst[j] = read_elem(type, dst + j*stride, k);
+                  fcon[j] = read_elem(type, con + j*stride, k);
+               }
 
-      success = compare_vec(type, res, ref);
+               compute_blend_ref(blend, fsrc, fdst, fcon, fref);
 
-      if (!success) {
-         if(verbose < 1)
-            dump_blend_type(stderr, blend, type);
-         fprintf(stderr, "MISMATCH\n");
+               for(j = 0; j < 4; ++j)
+                  write_elem(type, ref + j*stride, k, fref[j]);
+            }
+         }
+
+         start_counter = rdtsc();
+         blend_test_ptr(src, dst, con, res);
+         end_counter = rdtsc();
+
+         cycles[i] = end_counter - start_counter;
+
+         mismatch = FALSE;
+         for (j = 0; j < 4; ++j)
+            if(!compare_vec(type, res + j*stride, ref + j*stride))
+               mismatch = TRUE;
 
-         fprintf(stderr, "  Src: ");
-         dump_vec(stderr, type, src);
-         fprintf(stderr, "\n");
+         if (mismatch) {
+            success = FALSE;
 
-         fprintf(stderr, "  Dst: ");
-         dump_vec(stderr, type, dst);
-         fprintf(stderr, "\n");
+            if(verbose < 1)
+               dump_blend_type(stderr, blend, mode, type);
+            fprintf(stderr, "MISMATCH\n");
+            for(j = 0; j < 4; ++j) {
+               char channel = "RGBA"[j];
+               fprintf(stderr, "  Src%c: ", channel);
+               dump_vec(stderr, type, src + j*stride);
+               fprintf(stderr, "\n");
 
-         fprintf(stderr, "  Con: ");
-         dump_vec(stderr, type, con);
-         fprintf(stderr, "\n");
+               fprintf(stderr, "  Dst%c: ", channel);
+               dump_vec(stderr, type, dst + j*stride);
+               fprintf(stderr, "\n");
 
-         fprintf(stderr, "  Res: ");
-         dump_vec(stderr, type, res);
-         fprintf(stderr, "\n");
+               fprintf(stderr, "  Con%c: ", channel);
+               dump_vec(stderr, type, con + j*stride);
+               fprintf(stderr, "\n");
 
-         fprintf(stderr, "  Ref: ");
-         dump_vec(stderr, type, ref);
-         fprintf(stderr, "\n");
+               fprintf(stderr, "  Res%c: ", channel);
+               dump_vec(stderr, type, res + j*stride);
+               fprintf(stderr, "\n");
+
+               fprintf(stderr, "  Ref%c: ", channel);
+               dump_vec(stderr, type, ref + j*stride);
+               fprintf(stderr, "\n");
+            }
+         }
       }
    }
 
@@ -569,7 +705,7 @@ test_one(unsigned verbose,
    }
 
    if(fp)
-      write_tsv_row(fp, blend, type, cycles_avg, success);
+      write_tsv_row(fp, blend, mode, type, cycles_avg, success);
 
    if (!success) {
       if(verbose < 2)
@@ -650,6 +786,7 @@ test_all(unsigned verbose, FILE *fp)
    const unsigned *alpha_src_factor;
    const unsigned *alpha_dst_factor;
    struct pipe_blend_state blend;
+   enum vector_mode mode;
    const union lp_type *type;
    bool success = TRUE;
 
@@ -659,24 +796,26 @@ test_all(unsigned verbose, FILE *fp)
             for(rgb_dst_factor = blend_factors; rgb_dst_factor <= rgb_src_factor; ++rgb_dst_factor) {
                for(alpha_src_factor = blend_factors; alpha_src_factor < &blend_factors[num_factors]; ++alpha_src_factor) {
                   for(alpha_dst_factor = blend_factors; alpha_dst_factor <= alpha_src_factor; ++alpha_dst_factor) {
-                     for(type = blend_types; type < &blend_types[num_types]; ++type) {
-
-                        if(*rgb_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
-                           *alpha_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE)
-                           continue;
-
-                        memset(&blend, 0, sizeof blend);
-                        blend.blend_enable      = 1;
-                        blend.rgb_func          = *rgb_func;
-                        blend.rgb_src_factor    = *rgb_src_factor;
-                        blend.rgb_dst_factor    = *rgb_dst_factor;
-                        blend.alpha_func        = *alpha_func;
-                        blend.alpha_src_factor  = *alpha_src_factor;
-                        blend.alpha_dst_factor  = *alpha_dst_factor;
-
-                        if(!test_one(verbose, fp, &blend, *type))
-                          success = FALSE;
-
+                     for(mode = 0; mode < 2; ++mode) {
+                        for(type = blend_types; type < &blend_types[num_types]; ++type) {
+
+                           if(*rgb_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+                              *alpha_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE)
+                              continue;
+
+                           memset(&blend, 0, sizeof blend);
+                           blend.blend_enable      = 1;
+                           blend.rgb_func          = *rgb_func;
+                           blend.rgb_src_factor    = *rgb_src_factor;
+                           blend.rgb_dst_factor    = *rgb_dst_factor;
+                           blend.alpha_func        = *alpha_func;
+                           blend.alpha_src_factor  = *alpha_src_factor;
+                           blend.alpha_dst_factor  = *alpha_dst_factor;
+
+                           if(!test_one(verbose, fp, &blend, mode, *type))
+                             success = FALSE;
+
+                        }
                      }
                   }
                }
@@ -699,6 +838,7 @@ test_some(unsigned verbose, FILE *fp, unsigned long n)
    const unsigned *alpha_src_factor;
    const unsigned *alpha_dst_factor;
    struct pipe_blend_state blend;
+   enum vector_mode mode;
    const union lp_type *type;
    unsigned long i;
    bool success = TRUE;
@@ -717,20 +857,21 @@ test_some(unsigned verbose, FILE *fp, unsigned long n)
          alpha_dst_factor = &blend_factors[random() % num_factors];
       } while(*alpha_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE);
 
-      for(type = blend_types; type < &blend_types[num_types]; ++type) {
+      mode = random() & 1;
 
-         memset(&blend, 0, sizeof blend);
-         blend.blend_enable      = 1;
-         blend.rgb_func          = *rgb_func;
-         blend.rgb_src_factor    = *rgb_src_factor;
-         blend.rgb_dst_factor    = *rgb_dst_factor;
-         blend.alpha_func        = *alpha_func;
-         blend.alpha_src_factor  = *alpha_src_factor;
-         blend.alpha_dst_factor  = *alpha_dst_factor;
+      type = &blend_types[random() % num_types];
 
-         if(!test_one(verbose, fp, &blend, *type))
-           success = FALSE;
-      }
+      memset(&blend, 0, sizeof blend);
+      blend.blend_enable      = 1;
+      blend.rgb_func          = *rgb_func;
+      blend.rgb_src_factor    = *rgb_src_factor;
+      blend.rgb_dst_factor    = *rgb_dst_factor;
+      blend.alpha_func        = *alpha_func;
+      blend.alpha_src_factor  = *alpha_src_factor;
+      blend.alpha_dst_factor  = *alpha_dst_factor;
+
+      if(!test_one(verbose, fp, &blend, mode, *type))
+        success = FALSE;
    }
 
    return success;