From 0a7824862eb753878fa79b153b2a111884ff1197 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 15 Sep 2010 17:04:26 -0600
Subject: [PATCH] gallivm: expand AoS sampling to cover all filtering modes

...and all texture targets (1D/2D/3D/CUBE).
---
 src/gallium/auxiliary/Makefile                    |    1 +
 src/gallium/auxiliary/SConscript                  |    1 +
 src/gallium/auxiliary/gallivm/lp_bld_sample.c     |  513 ++++++++-
 src/gallium/auxiliary/gallivm/lp_bld_sample.h     |  164 ++-
 src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c | 1145 +++++++++++++++++++++
 src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h |   65 ++
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 1059 +------------------
 7 files changed, 1919 insertions(+), 1029 deletions(-)
 create mode 100644 src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
 create mode 100644 src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h

diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
index 5388f4e..2a69294 100644
--- a/src/gallium/auxiliary/Makefile
+++ b/src/gallium/auxiliary/Makefile
@@ -168,6 +168,7 @@ GALLIVM_SOURCES = \
         gallivm/lp_bld_printf.c \
         gallivm/lp_bld_quad.c \
         gallivm/lp_bld_sample.c \
+        gallivm/lp_bld_sample_aos.c \
         gallivm/lp_bld_sample_soa.c \
         gallivm/lp_bld_struct.c \
         gallivm/lp_bld_swizzle.c \
diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript
index ba8be2e..cea2d7d 100644
--- a/src/gallium/auxiliary/SConscript
+++ b/src/gallium/auxiliary/SConscript
@@ -219,6 +219,7 @@ if env['llvm']:
     'gallivm/lp_bld_printf.c',
     'gallivm/lp_bld_quad.c',
     'gallivm/lp_bld_sample.c',
+    'gallivm/lp_bld_sample_aos.c',
     'gallivm/lp_bld_sample_soa.c',
     'gallivm/lp_bld_struct.c',
     'gallivm/lp_bld_swizzle.c',
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 259b114..e89ee7c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -36,11 +36,13 @@
 #include "pipe/p_state.h"
 #include "util/u_format.h"
 #include "util/u_math.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_const.h"
 #include "lp_bld_arit.h"
-#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_flow.h"
 #include "lp_bld_sample.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_type.h"
 
 
 /**
@@ -124,6 +126,511 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
 
 
 /**
+ * Generate code to compute texture level of detail (lambda).
+ * \param ddx  partial derivatives of (s, t, r, q) with respect to X
+ * \param ddy  partial derivatives of (s, t, r, q) with respect to Y
+ * \param lod_bias  optional float vector with the shader lod bias
+ * \param explicit_lod  optional float vector with the explicit lod
+ * \param width  scalar int texture width
+ * \param height  scalar int texture height
+ * \param depth  scalar int texture depth
+ * \return  scalar float lod, clamped to the sampler's [min_lod, max_lod]
+ * XXX: The resulting lod is scalar, so only element 0 of the derivatives,
+ * lod_bias and explicit_lod passed by the shader is used; the rest is ignored.
+ */
+LLVMValueRef
+lp_build_lod_selector(struct lp_build_sample_context *bld,
+                      const LLVMValueRef ddx[4],
+                      const LLVMValueRef ddy[4],
+                      LLVMValueRef lod_bias, /* optional */
+                      LLVMValueRef explicit_lod, /* optional */
+                      LLVMValueRef width,
+                      LLVMValueRef height,
+                      LLVMValueRef depth)
+
+{
+   if (bld->static_state->min_lod == bld->static_state->max_lod) {
+      /* User is forcing sampling from a particular mipmap level.
+       * This is hit during mipmap generation.
+       */
+      return LLVMConstReal(LLVMFloatType(), bld->static_state->min_lod);
+   }
+   else {
+      struct lp_build_context *float_bld = &bld->float_bld;
+      LLVMValueRef sampler_lod_bias = LLVMConstReal(LLVMFloatType(),
+                                                    bld->static_state->lod_bias);
+      LLVMValueRef min_lod = LLVMConstReal(LLVMFloatType(),
+                                           bld->static_state->min_lod);
+      LLVMValueRef max_lod = LLVMConstReal(LLVMFloatType(),
+                                           bld->static_state->max_lod);
+      LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+      LLVMValueRef lod;
+
+      if (explicit_lod) {
+         lod = LLVMBuildExtractElement(bld->builder, explicit_lod,
+                                       index0, "");
+      }
+      else {
+         const int dims = texture_dims(bld->static_state->target);
+         LLVMValueRef dsdx, dsdy;
+         LLVMValueRef dtdx = NULL, dtdy = NULL, drdx = NULL, drdy = NULL;
+         LLVMValueRef rho;
+
+         dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx");
+         dsdx = lp_build_abs(float_bld, dsdx);
+         dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy");
+         dsdy = lp_build_abs(float_bld, dsdy);
+         if (dims > 1) {
+            dtdx = LLVMBuildExtractElement(bld->builder, ddx[1], index0, "dtdx");
+            dtdx = lp_build_abs(float_bld, dtdx);
+            dtdy = LLVMBuildExtractElement(bld->builder, ddy[1], index0, "dtdy");
+            dtdy = lp_build_abs(float_bld, dtdy);
+            if (dims > 2) {
+               drdx = LLVMBuildExtractElement(bld->builder, ddx[2], index0, "drdx");
+               drdx = lp_build_abs(float_bld, drdx);
+               drdy = LLVMBuildExtractElement(bld->builder, ddy[2], index0, "drdy");
+               drdy = lp_build_abs(float_bld, drdy);
+            }
+         }
+
+         /* Compute rho = max of all partial derivatives scaled by texture size.
+          * XXX this could be vectorized somewhat
+          */
+         rho = LLVMBuildFMul(bld->builder,
+                            lp_build_max(float_bld, dsdx, dsdy),
+                            lp_build_int_to_float(float_bld, width), "");
+         if (dims > 1) {
+            LLVMValueRef max;
+            max = LLVMBuildFMul(bld->builder,
+                               lp_build_max(float_bld, dtdx, dtdy),
+                               lp_build_int_to_float(float_bld, height), "");
+            rho = lp_build_max(float_bld, rho, max);
+            if (dims > 2) {
+               max = LLVMBuildFMul(bld->builder,
+                                  lp_build_max(float_bld, drdx, drdy),
+                                  lp_build_int_to_float(float_bld, depth), "");
+               rho = lp_build_max(float_bld, rho, max);
+            }
+         }
+
+         /* compute lod = log2(rho) */
+         lod = lp_build_log2(float_bld, rho);
+
+         /* add shader lod bias (skipped when explicit_lod is given) */
+         if (lod_bias) {
+            lod_bias = LLVMBuildExtractElement(bld->builder, lod_bias,
+                                               index0, "");
+            lod = LLVMBuildFAdd(bld->builder, lod, lod_bias, "shader_lod_bias");
+         }
+      }
+
+      /* add sampler lod bias (applied to explicit lods too) */
+      lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias");
+
+      /* clamp lod to the sampler's [min_lod, max_lod] */
+      lod = lp_build_clamp(float_bld, lod, min_lod, max_lod);
+
+      return lod;
+   }
+}
+
+
+/**
+ * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer mipmap level.
+ * Note: this is all scalar code.
+ * \param unit  texture unit whose last_level is fetched from dynamic_state
+ * \param lod  scalar float texture level of detail
+ * \param level_out  returns round(lod) clamped to [0, last_level]
+ */
+void
+lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
+                           unsigned unit,
+                           LLVMValueRef lod,
+                           LLVMValueRef *level_out)
+{
+   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *int_bld = &bld->int_bld;
+   LLVMValueRef last_level, level;
+
+   LLVMValueRef zero = LLVMConstInt(LLVMInt32Type(), 0, 0);
+
+   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
+                                               bld->builder, unit);
+
+   /* convert float lod to integer (round to nearest) */
+   level = lp_build_iround(float_bld, lod);
+
+   /* clamp level to legal range of levels */
+   *level_out = lp_build_clamp(int_bld, level, zero, last_level);
+}
+
+
+/**
+ * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to two (adjacent) integer
+ * mipmap level indexes (floor(lod) and floor(lod) + 1, each clamped to
+ * [0, last_level]) plus the fract(lod) weight for interpolating between them.
+ */
+void
+lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
+                           unsigned unit,
+                           LLVMValueRef lod,
+                           LLVMValueRef *level0_out,
+                           LLVMValueRef *level1_out,
+                           LLVMValueRef *weight_out)
+{
+   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *int_bld = &bld->int_bld;
+   LLVMValueRef last_level, level;
+
+   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
+                                               bld->builder, unit);
+
+   /* convert float lod to integer (round down) */
+   level = lp_build_ifloor(float_bld, lod);
+
+   /* compute level 0 and clamp to legal range of levels */
+   *level0_out = lp_build_clamp(int_bld, level,
+                                int_bld->zero,
+                                last_level);
+   /* compute level 1 and clamp to legal range of levels */
+   level = lp_build_add(int_bld, level, int_bld->one);
+   *level1_out = lp_build_clamp(int_bld, level,
+                                int_bld->zero,
+                                last_level);
+
+   *weight_out = lp_build_fract(float_bld, lod); /* fractional part of lod */
+}
+
+/** Load and return element 'level' of the array that 'data_array' points to. */
+LLVMValueRef
+lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
+                          LLVMValueRef data_array, LLVMValueRef level)
+{
+   LLVMValueRef indexes[2], data_ptr;
+   indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); /* step through pointer */
+   indexes[1] = level;                               /* array element index */
+   data_ptr = LLVMBuildGEP(bld->builder, data_array, indexes, 2, "");
+   data_ptr = LLVMBuildLoad(bld->builder, data_ptr, "");
+   return data_ptr;
+}
+
+/** Same as lp_build_get_mipmap_level() but with a compile-time level index. */
+LLVMValueRef
+lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
+                                LLVMValueRef data_array, int level)
+{
+   LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0); /* i32 const */
+   return lp_build_get_mipmap_level(bld, data_array, lvl);
+}
+
+
+/**
+ * Codegen equivalent for u_minify(); the max uses the int_coord_bld type.
+ * Return max(1, base_size >> level);
+ */
+static LLVMValueRef
+lp_build_minify(struct lp_build_sample_context *bld,
+                LLVMValueRef base_size,
+                LLVMValueRef level)
+{
+   LLVMValueRef size = LLVMBuildLShr(bld->builder, base_size, level, "minify");
+   size = lp_build_max(&bld->int_coord_bld, size, bld->int_coord_bld.one);
+   return size;
+}
+
+
+/**
+ * Load stride_array[level] (a scalar stride) and return it broadcast
+ * to a vector of the int_coord_bld type.
+ */
+static LLVMValueRef
+lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
+                              LLVMValueRef stride_array, LLVMValueRef level)
+{
+   LLVMValueRef indexes[2], stride;
+   indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   indexes[1] = level;
+   stride = LLVMBuildGEP(bld->builder, stride_array, indexes, 2, "");
+   stride = LLVMBuildLoad(bld->builder, stride, "");
+   stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride);
+   return stride;
+}
+
+
+/**
+ * Compute the width/height/depth and row/image strides of mipmap level
+ * 'ilevel0' and, for PIPE_TEX_MIPFILTER_LINEAR only, of level 'ilevel1'.
+ * Outputs not applicable to 'dims', the target or the filter are not written.
+ */
+void
+lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
+                            unsigned dims,
+                            LLVMValueRef width_vec,
+                            LLVMValueRef height_vec,
+                            LLVMValueRef depth_vec,
+                            LLVMValueRef ilevel0,
+                            LLVMValueRef ilevel1,
+                            LLVMValueRef row_stride_array,
+                            LLVMValueRef img_stride_array,
+                            LLVMValueRef *width0_vec,
+                            LLVMValueRef *width1_vec,
+                            LLVMValueRef *height0_vec,
+                            LLVMValueRef *height1_vec,
+                            LLVMValueRef *depth0_vec,
+                            LLVMValueRef *depth1_vec,
+                            LLVMValueRef *row_stride0_vec,
+                            LLVMValueRef *row_stride1_vec,
+                            LLVMValueRef *img_stride0_vec,
+                            LLVMValueRef *img_stride1_vec)
+{
+   const unsigned mip_filter = bld->static_state->min_mip_filter;
+   LLVMValueRef ilevel0_vec, ilevel1_vec = NULL; /* set only for LINEAR */
+
+   ilevel0_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel0);
+   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR)
+      ilevel1_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel1);
+
+   /*
+    * Compute width, height, depth at mipmap level 'ilevel0'
+    */
+   *width0_vec = lp_build_minify(bld, width_vec, ilevel0_vec);
+   if (dims >= 2) {
+      *height0_vec = lp_build_minify(bld, height_vec, ilevel0_vec);
+      *row_stride0_vec = lp_build_get_level_stride_vec(bld,
+                                                       row_stride_array,
+                                                       ilevel0);
+      if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         *img_stride0_vec = lp_build_get_level_stride_vec(bld,
+                                                          img_stride_array,
+                                                          ilevel0);
+         if (dims == 3) {
+            *depth0_vec = lp_build_minify(bld, depth_vec, ilevel0_vec);
+         }
+      }
+   }
+   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+      /* compute width, height, depth for second mipmap level at 'ilevel1' */
+      *width1_vec = lp_build_minify(bld, width_vec, ilevel1_vec);
+      if (dims >= 2) {
+         *height1_vec = lp_build_minify(bld, height_vec, ilevel1_vec);
+         *row_stride1_vec = lp_build_get_level_stride_vec(bld,
+                                                          row_stride_array,
+                                                          ilevel1);
+         if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
+            *img_stride1_vec = lp_build_get_level_stride_vec(bld,
+                                                             img_stride_array,
+                                                             ilevel1);
+            if (dims == 3) {
+               *depth1_vec = lp_build_minify(bld, depth_vec, ilevel1_vec);
+            }
+         }
+      }
+   }
+}
+
+
+
+/** Helper used by lp_build_cube_lookup(): return -0.5 / abs(coord). */
+static LLVMValueRef
+lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
+{
+   /* ima = -0.5 / abs(coord); computed in the coord_bld vector type */
+   LLVMValueRef negHalf = lp_build_const_vec(coord_bld->type, -0.5);
+   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
+   LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord);
+   return ima;
+}
+
+
+/**
+ * Helper used by lp_build_cube_lookup(); negate_coord must be +1 or -1.
+ * \param sign  scalar +1 or -1, or NULL to skip the multiply by sign
+ * \param coord  float vector
+ * \param ima  float vector
+ */
+static LLVMValueRef
+lp_build_cube_coord(struct lp_build_context *coord_bld,
+                    LLVMValueRef sign, int negate_coord,
+                    LLVMValueRef coord, LLVMValueRef ima)
+{
+   /* return negate(coord) * ima * sign + 0.5; */
+   LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5);
+   LLVMValueRef res;
+
+   assert(negate_coord == +1 || negate_coord == -1);
+
+   if (negate_coord == -1) {
+      coord = lp_build_negate(coord_bld, coord);
+   }
+
+   res = lp_build_mul(coord_bld, coord, ima);
+   if (sign) {
+      sign = lp_build_broadcast_scalar(coord_bld, sign); /* scalar -> vector */
+      res = lp_build_mul(coord_bld, res, sign);
+   }
+   res = lp_build_add(coord_bld, res, half);
+
+   return res;
+}
+
+
+/** Helper used by lp_build_cube_lookup()
+ * Return (major_coord >= 0) ? pos_face : neg_face; as an int32 face index.
+ */
+static LLVMValueRef
+lp_build_cube_face(struct lp_build_sample_context *bld,
+                   LLVMValueRef major_coord,
+                   unsigned pos_face, unsigned neg_face)
+{
+   LLVMValueRef cmp = LLVMBuildFCmp(bld->builder, LLVMRealUGE, /* unordered */
+                                    major_coord,         /* or >=: NaN -> pos */
+                                    bld->float_bld.zero, "");
+   LLVMValueRef pos = LLVMConstInt(LLVMInt32Type(), pos_face, 0);
+   LLVMValueRef neg = LLVMConstInt(LLVMInt32Type(), neg_face, 0);
+   LLVMValueRef res = LLVMBuildSelect(bld->builder, cmp, pos, neg, "");
+   return res;
+}
+
+
+
+/**
+ * Generate code to do cube face selection and compute per-face texcoords.
+ * The face is chosen once for the whole quad, from the average of the four
+ * pixels' texcoords; face_s/face_t are then computed per pixel.
+ * \param s, t, r  float coord vectors (length 4, see the assert below)
+ * \param face  returns the scalar int PIPE_TEX_FACE_x index
+ * \param face_s  returns float vector of per-face s coords
+ * \param face_t  returns float vector of per-face t coords
+ */
+void
+lp_build_cube_lookup(struct lp_build_sample_context *bld,
+                     LLVMValueRef s,
+                     LLVMValueRef t,
+                     LLVMValueRef r,
+                     LLVMValueRef *face,
+                     LLVMValueRef *face_s,
+                     LLVMValueRef *face_t)
+{
+   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *coord_bld = &bld->coord_bld;
+   LLVMValueRef rx, ry, rz;
+   LLVMValueRef arx, ary, arz;
+   LLVMValueRef c25 = LLVMConstReal(LLVMFloatType(), 0.25);
+   LLVMValueRef arx_ge_ary, arx_ge_arz;
+   LLVMValueRef ary_ge_arx, ary_ge_arz;
+   LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
+
+   assert(bld->coord_bld.type.length == 4);
+
+   /*
+    * Use the average of the four pixel's texcoords to choose the face.
+    */
+   rx = lp_build_mul(float_bld, c25,
+                     lp_build_sum_vector(&bld->coord_bld, s));
+   ry = lp_build_mul(float_bld, c25,
+                     lp_build_sum_vector(&bld->coord_bld, t));
+   rz = lp_build_mul(float_bld, c25,
+                     lp_build_sum_vector(&bld->coord_bld, r));
+
+   arx = lp_build_abs(float_bld, rx);
+   ary = lp_build_abs(float_bld, ry);
+   arz = lp_build_abs(float_bld, rz);
+
+   /*
+    * Compare sign/magnitude of rx,ry,rz to determine face
+    */
+   arx_ge_ary = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, ary, "");
+   arx_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, arz, "");
+   ary_ge_arx = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arx, "");
+   ary_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arz, "");
+
+   arx_ge_ary_arz = LLVMBuildAnd(bld->builder, arx_ge_ary, arx_ge_arz, "");
+   ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, "");
+
+   {
+      struct lp_build_flow_context *flow_ctx;
+      struct lp_build_if_state if_ctx;
+
+      flow_ctx = lp_build_flow_create(bld->builder);
+      lp_build_flow_scope_begin(flow_ctx);
+
+      *face_s = bld->coord_bld.undef;
+      *face_t = bld->coord_bld.undef;
+      *face = bld->int_bld.undef;
+
+      lp_build_name(*face_s, "face_s");
+      lp_build_name(*face_t, "face_t");
+      lp_build_name(*face, "face");
+
+      lp_build_flow_scope_declare(flow_ctx, face_s);
+      lp_build_flow_scope_declare(flow_ctx, face_t);
+      lp_build_flow_scope_declare(flow_ctx, face);
+
+      lp_build_if(&if_ctx, flow_ctx, bld->builder, arx_ge_ary_arz);
+      {
+         /* +/- X face */
+         LLVMValueRef sign = lp_build_sgn(float_bld, rx);
+         LLVMValueRef ima = lp_build_cube_ima(coord_bld, s);
+         *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
+         *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
+         *face = lp_build_cube_face(bld, rx,
+                                    PIPE_TEX_FACE_POS_X,
+                                    PIPE_TEX_FACE_NEG_X);
+      }
+      lp_build_else(&if_ctx);
+      {
+         struct lp_build_flow_context *flow_ctx2;
+         struct lp_build_if_state if_ctx2;
+
+         LLVMValueRef face_s2 = bld->coord_bld.undef;
+         LLVMValueRef face_t2 = bld->coord_bld.undef;
+         LLVMValueRef face2 = bld->int_bld.undef;
+
+         flow_ctx2 = lp_build_flow_create(bld->builder);
+         lp_build_flow_scope_begin(flow_ctx2);
+         lp_build_flow_scope_declare(flow_ctx2, &face_s2);
+         lp_build_flow_scope_declare(flow_ctx2, &face_t2);
+         lp_build_flow_scope_declare(flow_ctx2, &face2);
+
+         /* ary_ge_arx_arz was already built before the outer if */
+         lp_build_if(&if_ctx2, flow_ctx2, bld->builder, ary_ge_arx_arz);
+         {
+            /* +/- Y face */
+            LLVMValueRef sign = lp_build_sgn(float_bld, ry);
+            LLVMValueRef ima = lp_build_cube_ima(coord_bld, t);
+            face_s2 = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
+            face_t2 = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
+            face2 = lp_build_cube_face(bld, ry,
+                                       PIPE_TEX_FACE_POS_Y,
+                                       PIPE_TEX_FACE_NEG_Y);
+         }
+         lp_build_else(&if_ctx2);
+         {
+            /* +/- Z face */
+            LLVMValueRef sign = lp_build_sgn(float_bld, rz);
+            LLVMValueRef ima = lp_build_cube_ima(coord_bld, r);
+            face_s2 = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
+            face_t2 = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
+            face2 = lp_build_cube_face(bld, rz,
+                                       PIPE_TEX_FACE_POS_Z,
+                                       PIPE_TEX_FACE_NEG_Z);
+         }
+         lp_build_endif(&if_ctx2);
+         lp_build_flow_scope_end(flow_ctx2);
+         lp_build_flow_destroy(flow_ctx2);
+         *face_s = face_s2;
+         *face_t = face_t2;
+         *face = face2;
+      }
+
+      lp_build_endif(&if_ctx);
+      lp_build_flow_scope_end(flow_ctx);
+      lp_build_flow_destroy(flow_ctx);
+   }
+}
+
+
+/**
  * Compute the partial offset of a pixel block along an arbitrary axis.
  *
  * @param coord   coordinate in pixels
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index caafc4e..ff72b8e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -37,8 +37,11 @@
 
 
 #include "pipe/p_format.h"
-
+#include "util/u_debug.h"
 #include "gallivm/lp_bld.h"
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_swizzle.h"
+
 
 struct pipe_resource;
 struct pipe_sampler_view;
@@ -81,6 +84,10 @@ struct lp_sampler_static_state
    unsigned normalized_coords:1;
    float lod_bias, min_lod, max_lod;
    float border_color[4];
+
+   /* Aero hacks */
+   unsigned force_nearest_s:1;
+   unsigned force_nearest_t:1;
 };
 
 
@@ -140,6 +147,96 @@ struct lp_sampler_dynamic_state
 
 
 /**
+ * Keep all information for sampling code generation in a single place.
+ */
+struct lp_build_sample_context
+{
+   LLVMBuilderRef builder;
+
+   const struct lp_sampler_static_state *static_state;
+
+   struct lp_sampler_dynamic_state *dynamic_state;
+
+   const struct util_format_description *format_desc;
+
+   /** regular scalar float type */
+   struct lp_type float_type;
+   struct lp_build_context float_bld;
+
+   /** regular scalar int type */
+   struct lp_type int_type;
+   struct lp_build_context int_bld;
+
+   /** Incoming coordinates type and build context */
+   struct lp_type coord_type;
+   struct lp_build_context coord_bld;
+
+   /** Unsigned integer coordinates */
+   struct lp_type uint_coord_type;
+   struct lp_build_context uint_coord_bld;
+
+   /** Signed integer coordinates */
+   struct lp_type int_coord_type;
+   struct lp_build_context int_coord_bld;
+
+   /** Output texels type and build context */
+   struct lp_type texel_type;
+   struct lp_build_context texel_bld;
+};
+
+
+
+/**
+ * Return whether lp_build_sample_wrap_linear_int() supports the given wrap
+ * mode; at this time only REPEAT and CLAMP_TO_EDGE are reported as supported.
+ */
+static INLINE boolean
+lp_is_simple_wrap_mode(unsigned mode)
+{
+   switch (mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      return TRUE;
+   default:
+      return FALSE;
+   }
+}
+
+/** Apply the static state's swizzle_r/g/b/a to the SoA texel, in place. */
+static INLINE void
+apply_sampler_swizzle(struct lp_build_sample_context *bld,
+                      LLVMValueRef *texel)
+{
+   unsigned char swizzles[4];
+
+   swizzles[0] = bld->static_state->swizzle_r;
+   swizzles[1] = bld->static_state->swizzle_g;
+   swizzles[2] = bld->static_state->swizzle_b;
+   swizzles[3] = bld->static_state->swizzle_a;
+
+   lp_build_swizzle_soa_inplace(&bld->texel_bld, texel, swizzles);
+}
+
+/** Return the dimension count for a texture target: 1D=1, 2D/CUBE=2, 3D=3. */
+static INLINE int
+texture_dims(enum pipe_texture_target tex)
+{
+   switch (tex) {
+   case PIPE_TEXTURE_1D:
+      return 1;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_CUBE:
+      return 2;
+   case PIPE_TEXTURE_3D:
+      return 3;
+   default:
+      assert(0 && "bad texture target in texture_dims()");
+      return 2; /* fallback when asserts are compiled out */
+   }
+}
+
+
+/**
  * Derive the sampler static state.
  */
 void
@@ -148,6 +245,71 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
                         const struct pipe_sampler_state *sampler);
 
 
+LLVMValueRef
+lp_build_lod_selector(struct lp_build_sample_context *bld,
+                      const LLVMValueRef ddx[4],
+                      const LLVMValueRef ddy[4],
+                      LLVMValueRef lod_bias, /* optional */
+                      LLVMValueRef explicit_lod, /* optional */
+                      LLVMValueRef width,
+                      LLVMValueRef height,
+                      LLVMValueRef depth);
+
+void
+lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
+                           unsigned unit,
+                           LLVMValueRef lod,
+                           LLVMValueRef *level_out);
+
+void
+lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
+                           unsigned unit,
+                           LLVMValueRef lod,
+                           LLVMValueRef *level0_out,
+                           LLVMValueRef *level1_out,
+                           LLVMValueRef *weight_out);
+
+LLVMValueRef
+lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
+                          LLVMValueRef data_array, LLVMValueRef level);
+
+LLVMValueRef
+lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
+                                LLVMValueRef data_array, int level);
+
+
+void
+lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
+                            unsigned dims,
+                            LLVMValueRef width_vec,
+                            LLVMValueRef height_vec,
+                            LLVMValueRef depth_vec,
+                            LLVMValueRef ilevel0,
+                            LLVMValueRef ilevel1,
+                            LLVMValueRef row_stride_array,
+                            LLVMValueRef img_stride_array,
+                            LLVMValueRef *width0_vec,
+                            LLVMValueRef *width1_vec,
+                            LLVMValueRef *height0_vec,
+                            LLVMValueRef *height1_vec,
+                            LLVMValueRef *depth0_vec,
+                            LLVMValueRef *depth1_vec,
+                            LLVMValueRef *row_stride0_vec,
+                            LLVMValueRef *row_stride1_vec,
+                            LLVMValueRef *img_stride0_vec,
+                            LLVMValueRef *img_stride1_vec);
+
+
+void
+lp_build_cube_lookup(struct lp_build_sample_context *bld,
+                     LLVMValueRef s,
+                     LLVMValueRef t,
+                     LLVMValueRef r,
+                     LLVMValueRef *face,
+                     LLVMValueRef *face_s,
+                     LLVMValueRef *face_t);
+
+
 void
 lp_build_sample_partial_offset(struct lp_build_context *bld,
                                unsigned block_length,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
new file mode 100644
index 0000000..a9a4e7b
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -0,0 +1,1145 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling -- AoS.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ * @author Brian Paul <brianp@vmware.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+#include "util/u_dump.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_format.h"
+#include "util/u_cpu_detect.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_pack.h"
+#include "lp_bld_flow.h"
+#include "lp_bld_gather.h"
+#include "lp_bld_format.h"
+#include "lp_bld_sample.h"
+#include "lp_bld_sample_aos.h"
+#include "lp_bld_quad.h"
+
+
+/**
+ * Emit LLVM code that applies the texture wrap mode to an integer texcoord
+ * (already scaled to the texture size) for nearest filtering, and turn the
+ * wrapped coordinate into an offset via lp_build_sample_partial_offset().
+ *
+ * Only PIPE_TEX_WRAP_REPEAT and PIPE_TEX_WRAP_CLAMP_TO_EDGE are implemented;
+ * every other wrap mode trips an assertion.
+ *
+ * \param block_length  is the length of the pixel block along the
+ *                      coordinate axis
+ * \param coord  the incoming texcoord (s,t,r or q) scaled to the texture size
+ * \param length  the texture size along one dimension
+ * \param stride  pixel stride along the coordinate axis (in bytes)
+ * \param is_pot  if TRUE, length is a power of two
+ * \param wrap_mode  one of PIPE_TEX_WRAP_x
+ * \param out_offset  byte offset for the wrapped coordinate
+ * \param out_i  resulting sub-block pixel coordinate for coord
+ */
+static void
+lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
+                                 unsigned block_length,
+                                 LLVMValueRef coord,
+                                 LLVMValueRef length,
+                                 LLVMValueRef stride,
+                                 boolean is_pot,
+                                 unsigned wrap_mode,
+                                 LLVMValueRef *out_offset,
+                                 LLVMValueRef *out_i)
+{
+   struct lp_build_context *ucoord = &bld->uint_coord_bld;
+   struct lp_build_context *icoord = &bld->int_coord_bld;
+   LLVMValueRef last_texel = lp_build_sub(ucoord, length, ucoord->one);
+   LLVMValueRef wrapped = coord;
+
+   if (wrap_mode == PIPE_TEX_WRAP_REPEAT) {
+      if (is_pot) {
+         /* power-of-two size: wrapping is a simple bit mask */
+         wrapped = LLVMBuildAnd(bld->builder, wrapped, last_texel, "");
+      }
+      else {
+         /* Unsigned remainder is used on purpose: a signed remainder would
+          * give the wrong result for negative dividends.
+          */
+         wrapped = LLVMBuildURem(bld->builder, wrapped, length, "");
+      }
+   }
+   else if (wrap_mode == PIPE_TEX_WRAP_CLAMP_TO_EDGE) {
+      /* clamp to [0, length - 1] using signed comparisons */
+      wrapped = lp_build_max(icoord, wrapped, icoord->zero);
+      wrapped = lp_build_min(icoord, wrapped, last_texel);
+   }
+   else {
+      /* PIPE_TEX_WRAP_CLAMP, CLAMP_TO_BORDER and all MIRROR_* modes are
+       * not handled by this path.
+       */
+      assert(0);
+   }
+
+   lp_build_sample_partial_offset(ucoord, block_length, wrapped, stride,
+                                  out_offset, out_i);
+}
+
+
+/**
+ * Emit LLVM code that applies the texture wrap mode to an integer texcoord
+ * (already scaled to the texture size) for linear filtering.  Two texels
+ * are addressed along the axis: the one at coord0 and the one at coord0 + 1.
+ *
+ * Only PIPE_TEX_WRAP_REPEAT and PIPE_TEX_WRAP_CLAMP_TO_EDGE are implemented;
+ * every other wrap mode trips an assertion and yields zero offsets.
+ *
+ * \param block_length  is the length of the pixel block along the
+ *                      coordinate axis
+ * \param coord0  the incoming texcoord (s,t,r or q) scaled to the texture size
+ * \param length  the texture size along one dimension
+ * \param stride  pixel stride along the coordinate axis (in bytes)
+ * \param is_pot  if TRUE, length is a power of two
+ * \param wrap_mode  one of PIPE_TEX_WRAP_x
+ * \param offset0  resulting relative offset for coord0
+ * \param offset1  resulting relative offset for coord0 + 1
+ * \param i0  resulting sub-block pixel coordinate for coord0
+ * \param i1  resulting sub-block pixel coordinate for coord0 + 1
+ */
+static void
+lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
+                                unsigned block_length,
+                                LLVMValueRef coord0,
+                                LLVMValueRef length,
+                                LLVMValueRef stride,
+                                boolean is_pot,
+                                unsigned wrap_mode,
+                                LLVMValueRef *offset0,
+                                LLVMValueRef *offset1,
+                                LLVMValueRef *i0,
+                                LLVMValueRef *i1)
+{
+   struct lp_build_context *ucoord = &bld->uint_coord_bld;
+   struct lp_build_context *icoord = &bld->int_coord_bld;
+   LLVMValueRef last_texel;
+
+   if (block_length != 1) {
+      /*
+       * Multi-pixel blocks: offset1 cannot be cheaply derived from offset0,
+       * so wrap coord0 and coord0 + 1 separately with the nearest helper.
+       */
+      LLVMValueRef next_coord;
+
+      lp_build_sample_wrap_nearest_int(bld, block_length, coord0, length,
+                                       stride, is_pot, wrap_mode,
+                                       offset0, i0);
+
+      next_coord = lp_build_add(icoord, coord0, icoord->one);
+
+      lp_build_sample_wrap_nearest_int(bld, block_length, next_coord, length,
+                                       stride, is_pot, wrap_mode,
+                                       offset1, i1);
+      return;
+   }
+
+   /*
+    * One pixel per block from here on: the sub-block coordinates are zero
+    * and both offsets are derived from one multiplication by the stride.
+    */
+   *i0 = ucoord->zero;
+   *i1 = ucoord->zero;
+
+   last_texel = lp_build_sub(icoord, length, icoord->one);
+
+   switch (wrap_mode) {
+   case PIPE_TEX_WRAP_REPEAT: {
+      LLVMValueRef not_last, next_offset;
+
+      /* Unsigned remainder on purpose for the non-power-of-two case: a
+       * signed remainder is wrong for negative dividends.
+       */
+      coord0 = is_pot
+         ? LLVMBuildAnd(bld->builder, coord0, last_texel, "")
+         : LLVMBuildURem(bld->builder, coord0, length, "");
+
+      /* all-ones unless coord0 is the last texel; in that case the
+       * neighbour wraps around to offset 0.
+       */
+      not_last = lp_build_compare(bld->builder, icoord->type,
+                                  PIPE_FUNC_NOTEQUAL, coord0, last_texel);
+
+      *offset0 = lp_build_mul(ucoord, coord0, stride);
+      next_offset = lp_build_add(ucoord, *offset0, stride);
+      *offset1 = LLVMBuildAnd(bld->builder, next_offset, not_last, "");
+      break;
+   }
+
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE: {
+      LLVMValueRef ge_zero, lt_last, in_range, step;
+
+      ge_zero = lp_build_compare(icoord->builder, icoord->type,
+                                 PIPE_FUNC_GEQUAL, coord0, icoord->zero);
+      lt_last = lp_build_compare(icoord->builder, icoord->type,
+                                 PIPE_FUNC_LESS, coord0, last_texel);
+
+      /* clamp coord0 to [0, length - 1] */
+      coord0 = lp_build_select(icoord, ge_zero, coord0, icoord->zero);
+      coord0 = lp_build_select(icoord, lt_last, coord0, last_texel);
+
+      /* the neighbour only advances by one stride when coord0 was not
+       * clamped at either end.
+       */
+      in_range = LLVMBuildAnd(bld->builder, ge_zero, lt_last, "");
+
+      *offset0 = lp_build_mul(ucoord, coord0, stride);
+      step = LLVMBuildAnd(bld->builder, stride, in_range, "");
+      *offset1 = lp_build_add(ucoord, *offset0, step);
+      break;
+   }
+
+   default:
+      /* PIPE_TEX_WRAP_CLAMP, CLAMP_TO_BORDER and all MIRROR_* modes are
+       * not handled by this path.
+       */
+      assert(0);
+      *offset0 = ucoord->zero;
+      *offset1 = ucoord->zero;
+      break;
+   }
+}
+
+
+/**
+ * Sample a single texture image with nearest sampling.
+ * If sampling a cube texture, r = cube face in [0,5].
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ *
+ * \param width_vec, height_vec, depth_vec  image size; used to scale
+ *        normalized coords and as the wrap length for s, t and r
+ * \param row_stride_vec  stride used for the t coordinate
+ * \param img_stride_vec  stride used for the r coordinate / cube face
+ * \param data_ptr  base pointer the texel offsets are applied to
+ * \param s, t, r  floating point texcoords
+ * \param colors_lo, colors_hi  the two halves of the fetched texels,
+ *        expanded from 8 to 16 bits per channel
+ */
+static void
+lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
+                              LLVMValueRef width_vec,
+                              LLVMValueRef height_vec,
+                              LLVMValueRef depth_vec,
+                              LLVMValueRef row_stride_vec,
+                              LLVMValueRef img_stride_vec,
+                              LLVMValueRef data_ptr,
+                              LLVMValueRef s,
+                              LLVMValueRef t,
+                              LLVMValueRef r,
+                              LLVMValueRef *colors_lo,
+                              LLVMValueRef *colors_hi)
+{
+   const int dims = texture_dims(bld->static_state->target);
+   LLVMBuilderRef builder = bld->builder;
+   struct lp_build_context i32, h16, u8n;
+   LLVMTypeRef i32_vec_type, u8n_vec_type;
+   LLVMValueRef i32_c8;
+   LLVMValueRef s_ipart, t_ipart, r_ipart;
+   LLVMValueRef x_stride;
+   LLVMValueRef x_offset, offset;
+   LLVMValueRef x_subcoord, y_subcoord, z_subcoord;
+
+   lp_build_context_init(&i32, builder, lp_type_int_vec(32));
+   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
+   lp_build_context_init(&u8n, builder, lp_type_unorm(8));
+
+   i32_vec_type = lp_build_vec_type(i32.type);
+   u8n_vec_type = lp_build_vec_type(u8n.type);
+
+   if (bld->static_state->normalized_coords) {
+      /* s = s * width, t = t * height, r = r * depth */
+      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type);
+      LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec,
+                                              coord_vec_type, "");
+      s = lp_build_mul(&bld->coord_bld, s, fp_width);
+      if (dims >= 2) {
+         LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec,
+                                                  coord_vec_type, "");
+         t = lp_build_mul(&bld->coord_bld, t, fp_height);
+         if (dims >= 3) {
+            LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec,
+                                                    coord_vec_type, "");
+            r = lp_build_mul(&bld->coord_bld, r, fp_depth);
+         }
+      }
+   }
+
+   /* scale coords by 256 (8 fractional bits) */
+   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
+   if (dims >= 2)
+      t = lp_build_mul_imm(&bld->coord_bld, t, 256);
+   if (dims >= 3)
+      r = lp_build_mul_imm(&bld->coord_bld, r, 256);
+
+   /* convert float to int */
+   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
+   if (dims >= 2)
+      t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
+   if (dims >= 3)
+      r = LLVMBuildFPToSI(builder, r, i32_vec_type, "");
+
+   /* compute floor (shift right 8) */
+   i32_c8 = lp_build_const_int_vec(i32.type, 8);
+   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
+   if (dims >= 2)
+      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
+   if (dims >= 3)
+      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");
+
+   /* get pixel, row, image strides */
+   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
+                                 bld->format_desc->block.bits/8);
+
+   /*
+    * The y sub-block coordinate is only produced by the t wrap below.  Give
+    * it a defined value so that the generic fetch path never reads an
+    * uninitialized variable when sampling a 1-D texture.
+    */
+   y_subcoord = bld->uint_coord_bld.zero;
+
+   /* Do texcoord wrapping, compute texel offset */
+   lp_build_sample_wrap_nearest_int(bld,
+                                    bld->format_desc->block.width,
+                                    s_ipart, width_vec, x_stride,
+                                    bld->static_state->pot_width,
+                                    bld->static_state->wrap_s,
+                                    &x_offset, &x_subcoord);
+   offset = x_offset;
+   if (dims >= 2) {
+      LLVMValueRef y_offset;
+      lp_build_sample_wrap_nearest_int(bld,
+                                       bld->format_desc->block.height,
+                                       t_ipart, height_vec, row_stride_vec,
+                                       bld->static_state->pot_height,
+                                       bld->static_state->wrap_t,
+                                       &y_offset, &y_subcoord);
+      offset = lp_build_add(&bld->uint_coord_bld, offset, y_offset);
+      if (dims >= 3) {
+         LLVMValueRef z_offset;
+         /* The wrap length here is the depth, so the power-of-two flag
+          * must be the depth one (not the height one).  z_subcoord is
+          * required by the helper's signature but not used afterwards.
+          */
+         lp_build_sample_wrap_nearest_int(bld,
+                                          1, /* block length (depth) */
+                                          r_ipart, depth_vec, img_stride_vec,
+                                          bld->static_state->pot_depth,
+                                          bld->static_state->wrap_r,
+                                          &z_offset, &z_subcoord);
+         offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset);
+      }
+      else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         LLVMValueRef z_offset;
+         /* The r coord is the cube face in [0,5] */
+         z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec);
+         offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset);
+      }
+   }
+
+   /*
+    * Fetch the pixels as 4 x 32bit (rgba order might differ):
+    *
+    *   rgba0 rgba1 rgba2 rgba3
+    *
+    * bit cast them into 16 x u8
+    *
+    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
+    *
+    * unpack them into two 8 x i16:
+    *
+    *   r0 g0 b0 a0 r1 g1 b1 a1
+    *   r2 g2 b2 a2 r3 g3 b3 a3
+    *
+    * The higher 8 bits of the resulting elements will be zero.
+    */
+   {
+      LLVMValueRef rgba8;
+
+      if (util_format_is_rgba8_variant(bld->format_desc)) {
+         /*
+          * Given the format is a rgba8, just read the pixels as is,
+          * without any swizzling. Swizzling will be done later.
+          */
+         rgba8 = lp_build_gather(bld->builder,
+                                 bld->texel_type.length,
+                                 bld->format_desc->block.bits,
+                                 bld->texel_type.width,
+                                 data_ptr, offset);
+
+         rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+      }
+      else {
+         rgba8 = lp_build_fetch_rgba_aos(bld->builder,
+                                         bld->format_desc,
+                                         u8n.type,
+                                         data_ptr, offset,
+                                         x_subcoord,
+                                         y_subcoord);
+      }
+
+      /* Expand one 4*rgba8 to two 2*rgba16 */
+      lp_build_unpack2(builder, u8n.type, h16.type,
+                       rgba8,
+                       colors_lo, colors_hi);
+   }
+}
+
+
+/**
+ * Sample a single texture image with (bi-)(tri-)linear sampling.
+ * If sampling a cube texture, r = cube face in [0,5].
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ *
+ * \param width_vec, height_vec, depth_vec  image size; used to scale
+ *        normalized coords and as the wrap length for s, t and r
+ * \param row_stride_vec  stride used for the t coordinate
+ * \param img_stride_vec  stride used for the r coordinate / cube face
+ * \param data_ptr  base pointer the texel offsets are applied to
+ * \param s, t, r  floating point texcoords
+ * \param colors_lo, colors_hi  the two halves of the filtered result
+ */
+static void
+lp_build_sample_image_linear(struct lp_build_sample_context *bld,
+                             LLVMValueRef width_vec,
+                             LLVMValueRef height_vec,
+                             LLVMValueRef depth_vec,
+                             LLVMValueRef row_stride_vec,
+                             LLVMValueRef img_stride_vec,
+                             LLVMValueRef data_ptr,
+                             LLVMValueRef s,
+                             LLVMValueRef t,
+                             LLVMValueRef r,
+                             LLVMValueRef *colors_lo,
+                             LLVMValueRef *colors_hi)
+{
+   const int dims = texture_dims(bld->static_state->target);
+   LLVMBuilderRef builder = bld->builder;
+   struct lp_build_context i32, h16, u8n;
+   LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
+   LLVMValueRef i32_c8, i32_c128, i32_c255;
+   LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
+   LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi;
+   LLVMValueRef r_ipart, r_fpart, r_fpart_lo, r_fpart_hi;
+   LLVMValueRef x_stride, y_stride, z_stride;
+   LLVMValueRef x_offset0, x_offset1;
+   LLVMValueRef y_offset0, y_offset1;
+   LLVMValueRef z_offset0, z_offset1;
+   LLVMValueRef offset[2][2][2]; /* [z][y][x] */
+   LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
+   LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
+   LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
+   LLVMValueRef packed_lo, packed_hi;
+   unsigned x, y, z;
+   unsigned i, j, k;
+   unsigned numj, numk;
+
+   lp_build_context_init(&i32, builder, lp_type_int_vec(32));
+   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
+   lp_build_context_init(&u8n, builder, lp_type_unorm(8));
+
+   i32_vec_type = lp_build_vec_type(i32.type);
+   h16_vec_type = lp_build_vec_type(h16.type);
+   u8n_vec_type = lp_build_vec_type(u8n.type);
+
+   if (bld->static_state->normalized_coords) {
+      /* s = s * width, t = t * height, r = r * depth */
+      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type);
+      LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec,
+                                              coord_vec_type, "");
+      s = lp_build_mul(&bld->coord_bld, s, fp_width);
+      if (dims >= 2) {
+         LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec,
+                                                  coord_vec_type, "");
+         t = lp_build_mul(&bld->coord_bld, t, fp_height);
+      }
+      if (dims >= 3) {
+         LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec,
+                                                 coord_vec_type, "");
+         r = lp_build_mul(&bld->coord_bld, r, fp_depth);
+      }
+   }
+
+   /* scale coords by 256 (8 fractional bits) */
+   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
+   if (dims >= 2)
+      t = lp_build_mul_imm(&bld->coord_bld, t, 256);
+   if (dims >= 3)
+      r = lp_build_mul_imm(&bld->coord_bld, r, 256);
+
+   /* convert float to int */
+   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
+   if (dims >= 2)
+      t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
+   if (dims >= 3)
+      r = LLVMBuildFPToSI(builder, r, i32_vec_type, "");
+
+   /* subtract 0.5 (add -128), except along axes forced to nearest */
+   i32_c128 = lp_build_const_int_vec(i32.type, -128);
+   if (!bld->static_state->force_nearest_s) {
+      s = LLVMBuildAdd(builder, s, i32_c128, "");
+   }
+   if (dims >= 2 && !bld->static_state->force_nearest_t) {
+      t = LLVMBuildAdd(builder, t, i32_c128, "");
+   }
+   if (dims >= 3) {
+      r = LLVMBuildAdd(builder, r, i32_c128, "");
+   }
+
+   /* compute floor (shift right 8) */
+   i32_c8 = lp_build_const_int_vec(i32.type, 8);
+   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
+   if (dims >= 2)
+      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
+   if (dims >= 3)
+      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");
+
+   /* compute fractional part (AND with 0xff) */
+   i32_c255 = lp_build_const_int_vec(i32.type, 255);
+   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
+   if (dims >= 2)
+      t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
+   if (dims >= 3)
+      r_fpart = LLVMBuildAnd(builder, r, i32_c255, "");
+
+   /* get pixel, row and image strides */
+   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
+                                 bld->format_desc->block.bits/8);
+   y_stride = row_stride_vec;
+   z_stride = img_stride_vec;
+
+   /*
+    * The y sub-block coordinates are only produced by the t wrap below.
+    * Give them a defined value so that the generic fetch path never reads
+    * an uninitialized variable when sampling a 1-D texture.
+    */
+   y_subcoord[0] = bld->uint_coord_bld.zero;
+   y_subcoord[1] = bld->uint_coord_bld.zero;
+
+   /* do texcoord wrapping and compute texel offsets */
+   lp_build_sample_wrap_linear_int(bld,
+                                   bld->format_desc->block.width,
+                                   s_ipart, width_vec, x_stride,
+                                   bld->static_state->pot_width,
+                                   bld->static_state->wrap_s,
+                                   &x_offset0, &x_offset1,
+                                   &x_subcoord[0], &x_subcoord[1]);
+   for (z = 0; z < 2; z++) {
+      for (y = 0; y < 2; y++) {
+         offset[z][y][0] = x_offset0;
+         offset[z][y][1] = x_offset1;
+      }
+   }
+
+   if (dims >= 2) {
+      lp_build_sample_wrap_linear_int(bld,
+                                      bld->format_desc->block.height,
+                                      t_ipart, height_vec, y_stride,
+                                      bld->static_state->pot_height,
+                                      bld->static_state->wrap_t,
+                                      &y_offset0, &y_offset1,
+                                      &y_subcoord[0], &y_subcoord[1]);
+
+      for (z = 0; z < 2; z++) {
+         for (x = 0; x < 2; x++) {
+            offset[z][0][x] = lp_build_add(&bld->uint_coord_bld,
+                                           offset[z][0][x], y_offset0);
+            offset[z][1][x] = lp_build_add(&bld->uint_coord_bld,
+                                           offset[z][1][x], y_offset1);
+         }
+      }
+   }
+
+   if (dims >= 3) {
+      /* Pixel blocks have no extent along the depth axis, so the block
+       * length is 1 here (same as in the nearest path).  z_subcoord is
+       * required by the helper's signature but not used afterwards.
+       */
+      lp_build_sample_wrap_linear_int(bld,
+                                      1, /* block length (depth) */
+                                      r_ipart, depth_vec, z_stride,
+                                      bld->static_state->pot_depth,
+                                      bld->static_state->wrap_r,
+                                      &z_offset0, &z_offset1,
+                                      &z_subcoord[0], &z_subcoord[1]);
+      for (y = 0; y < 2; y++) {
+         for (x = 0; x < 2; x++) {
+            offset[0][y][x] = lp_build_add(&bld->uint_coord_bld,
+                                           offset[0][y][x], z_offset0);
+            offset[1][y][x] = lp_build_add(&bld->uint_coord_bld,
+                                           offset[1][y][x], z_offset1);
+         }
+      }
+   }
+   else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+      LLVMValueRef z_offset;
+      /* The r coord is the cube face in [0,5] */
+      z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec);
+      for (y = 0; y < 2; y++) {
+         for (x = 0; x < 2; x++) {
+            /* only the z = 0 slice is fetched for cube maps */
+            offset[0][y][x] = lp_build_add(&bld->uint_coord_bld,
+                                           offset[0][y][x], z_offset);
+         }
+      }
+   }
+
+   /*
+    * Transform 4 x i32 in
+    *
+    *   s_fpart = {s0, s1, s2, s3}
+    *
+    * into 8 x i16
+    *
+    *   s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
+    *
+    * into two 8 x i16
+    *
+    *   s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
+    *   s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
+    *
+    * and likewise for t_fpart and r_fpart. There is no risk of losing
+    * precision here since the fractional parts only use the lower 8 bits.
+    */
+   s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
+   if (dims >= 2)
+      t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
+   if (dims >= 3)
+      r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");
+
+   {
+      LLVMTypeRef elem_type = LLVMInt32Type();
+      LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef shuffle_lo;
+      LLVMValueRef shuffle_hi;
+
+      for (j = 0; j < h16.type.length; j += 4) {
+         /* pick the 16-bit half of each 32-bit element that holds the
+          * fractional bits; which half depends on endianness.
+          */
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+         unsigned subindex = 0;
+#else
+         unsigned subindex = 1;
+#endif
+         LLVMValueRef index;
+
+         index = LLVMConstInt(elem_type, j/2 + subindex, 0);
+         for (i = 0; i < 4; ++i)
+            shuffles_lo[j + i] = index;
+
+         index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
+         for (i = 0; i < 4; ++i)
+            shuffles_hi[j + i] = index;
+      }
+
+      shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
+      shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
+
+      s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
+                                          shuffle_lo, "");
+      s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
+                                          shuffle_hi, "");
+      if (dims >= 2) {
+         t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
+                                             shuffle_lo, "");
+         t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
+                                             shuffle_hi, "");
+      }
+      if (dims >= 3) {
+         r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
+                                             shuffle_lo, "");
+         r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
+                                             shuffle_hi, "");
+      }
+   }
+
+   /*
+    * Fetch the pixels as 4 x 32bit (rgba order might differ):
+    *
+    *   rgba0 rgba1 rgba2 rgba3
+    *
+    * bit cast them into 16 x u8
+    *
+    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
+    *
+    * unpack them into two 8 x i16:
+    *
+    *   r0 g0 b0 a0 r1 g1 b1 a1
+    *   r2 g2 b2 a2 r3 g3 b3 a3
+    *
+    * The higher 8 bits of the resulting elements will be zero.
+    */
+   numj = 1 + (dims >= 2);
+   numk = 1 + (dims >= 3);
+
+   for (k = 0; k < numk; k++) {
+      for (j = 0; j < numj; j++) {
+         for (i = 0; i < 2; i++) {
+            LLVMValueRef rgba8;
+
+            if (util_format_is_rgba8_variant(bld->format_desc)) {
+               /*
+                * Given the format is a rgba8, just read the pixels as is,
+                * without any swizzling. Swizzling will be done later.
+                */
+               rgba8 = lp_build_gather(bld->builder,
+                                       bld->texel_type.length,
+                                       bld->format_desc->block.bits,
+                                       bld->texel_type.width,
+                                       data_ptr, offset[k][j][i]);
+
+               rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+            }
+            else {
+               rgba8 = lp_build_fetch_rgba_aos(bld->builder,
+                                               bld->format_desc,
+                                               u8n.type,
+                                               data_ptr, offset[k][j][i],
+                                               x_subcoord[i],
+                                               y_subcoord[j]);
+            }
+
+            /* Expand one 4*rgba8 to two 2*rgba16 */
+            lp_build_unpack2(builder, u8n.type, h16.type,
+                             rgba8,
+                             &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
+         }
+      }
+   }
+
+   /*
+    * Linear interpolation with 8.8 fixed point.
+    */
+   if (bld->static_state->force_nearest_s) {
+      /* special case 1-D lerp: s is nearest, so blend only along t,
+       * i.e. between the y = 0 and y = 1 texels at x = 0.  The lo and hi
+       * halves must use the same pair of neighbors.
+       */
+      packed_lo = lp_build_lerp(&h16,
+                                t_fpart_lo,
+                                neighbors_lo[0][0][0],
+                                neighbors_lo[0][1][0]);
+
+      packed_hi = lp_build_lerp(&h16,
+                                t_fpart_hi,
+                                neighbors_hi[0][0][0],
+                                neighbors_hi[0][1][0]);
+   }
+   else if (bld->static_state->force_nearest_t) {
+      /* special case 1-D lerp: t is nearest, so blend only along s */
+      packed_lo = lp_build_lerp(&h16,
+                                s_fpart_lo,
+                                neighbors_lo[0][0][0],
+                                neighbors_lo[0][0][1]);
+
+      packed_hi = lp_build_lerp(&h16,
+                                s_fpart_hi,
+                                neighbors_hi[0][0][0],
+                                neighbors_hi[0][0][1]);
+   }
+   else {
+      /* general 1/2/3-D lerping */
+      if (dims == 1) {
+         packed_lo = lp_build_lerp(&h16,
+                                   s_fpart_lo,
+                                   neighbors_lo[0][0][0],
+                                   neighbors_lo[0][0][1]);
+
+         packed_hi = lp_build_lerp(&h16,
+                                   s_fpart_hi,
+                                   neighbors_hi[0][0][0],
+                                   neighbors_hi[0][0][1]);
+      }
+      else {
+         /* 2-D lerp */
+         packed_lo = lp_build_lerp_2d(&h16,
+                                      s_fpart_lo, t_fpart_lo,
+                                      neighbors_lo[0][0][0],
+                                      neighbors_lo[0][0][1],
+                                      neighbors_lo[0][1][0],
+                                      neighbors_lo[0][1][1]);
+
+         packed_hi = lp_build_lerp_2d(&h16,
+                                      s_fpart_hi, t_fpart_hi,
+                                      neighbors_hi[0][0][0],
+                                      neighbors_hi[0][0][1],
+                                      neighbors_hi[0][1][0],
+                                      neighbors_hi[0][1][1]);
+
+         if (dims >= 3) {
+            LLVMValueRef packed_lo2, packed_hi2;
+
+            /* lerp in the second z slice */
+            packed_lo2 = lp_build_lerp_2d(&h16,
+                                          s_fpart_lo, t_fpart_lo,
+                                          neighbors_lo[1][0][0],
+                                          neighbors_lo[1][0][1],
+                                          neighbors_lo[1][1][0],
+                                          neighbors_lo[1][1][1]);
+
+            packed_hi2 = lp_build_lerp_2d(&h16,
+                                          s_fpart_hi, t_fpart_hi,
+                                          neighbors_hi[1][0][0],
+                                          neighbors_hi[1][0][1],
+                                          neighbors_hi[1][1][0],
+                                          neighbors_hi[1][1][1]);
+            /* interp between two z slices */
+            packed_lo = lp_build_lerp(&h16, r_fpart_lo,
+                                      packed_lo, packed_lo2);
+            packed_hi = lp_build_lerp(&h16, r_fpart_hi,
+                                      packed_hi, packed_hi2);
+         }
+      }
+   }
+
+   *colors_lo = packed_lo;
+   *colors_hi = packed_hi;
+}
+
+
+/**
+ * Fetch filtered colors from one or two mipmap levels.  The '0' arguments
+ * (size vectors, strides, data_ptr0) describe the first level and the '1'
+ * arguments the second.  The second level is sampled and blended in with
+ * lod_fpart only when mip_filter is linear; otherwise the '1's are unused.
+ */
+static void
+lp_build_sample_mipmap(struct lp_build_sample_context *bld,
+                       unsigned img_filter,
+                       unsigned mip_filter,
+                       LLVMValueRef s,
+                       LLVMValueRef t,
+                       LLVMValueRef r,
+                       LLVMValueRef lod_fpart,
+                       LLVMValueRef width0_vec,
+                       LLVMValueRef width1_vec,
+                       LLVMValueRef height0_vec,
+                       LLVMValueRef height1_vec,
+                       LLVMValueRef depth0_vec,
+                       LLVMValueRef depth1_vec,
+                       LLVMValueRef row_stride0_vec,
+                       LLVMValueRef row_stride1_vec,
+                       LLVMValueRef img_stride0_vec,
+                       LLVMValueRef img_stride1_vec,
+                       LLVMValueRef data_ptr0,
+                       LLVMValueRef data_ptr1,
+                       LLVMValueRef *colors_lo,
+                       LLVMValueRef *colors_hi)
+{
+   const int blend_levels = (mip_filter == PIPE_TEX_MIPFILTER_LINEAR);
+   LLVMValueRef lvl0_lo, lvl0_hi, lvl1_lo, lvl1_hi;
+
+   if (img_filter != PIPE_TEX_FILTER_NEAREST) {
+      assert(img_filter == PIPE_TEX_FILTER_LINEAR);
+
+      /* linear image filter: first mipmap level */
+      lp_build_sample_image_linear(bld,
+                                   width0_vec, height0_vec, depth0_vec,
+                                   row_stride0_vec, img_stride0_vec,
+                                   data_ptr0, s, t, r,
+                                   &lvl0_lo, &lvl0_hi);
+
+      if (blend_levels) {
+         /* linear image filter: second mipmap level */
+         lp_build_sample_image_linear(bld,
+                                      width1_vec, height1_vec, depth1_vec,
+                                      row_stride1_vec, img_stride1_vec,
+                                      data_ptr1, s, t, r,
+                                      &lvl1_lo, &lvl1_hi);
+      }
+   }
+   else {
+      /* nearest image filter: first mipmap level */
+      lp_build_sample_image_nearest(bld,
+                                    width0_vec, height0_vec, depth0_vec,
+                                    row_stride0_vec, img_stride0_vec,
+                                    data_ptr0, s, t, r,
+                                    &lvl0_lo, &lvl0_hi);
+
+      if (blend_levels) {
+         /* nearest image filter: second mipmap level */
+         lp_build_sample_image_nearest(bld,
+                                       width1_vec, height1_vec, depth1_vec,
+                                       row_stride1_vec, img_stride1_vec,
+                                       data_ptr1, s, t, r,
+                                       &lvl1_lo, &lvl1_hi);
+      }
+   }
+
+   if (!blend_levels) {
+      /* single level: hand its colors straight back to the caller */
+      *colors_lo = lvl0_lo;
+      *colors_hi = lvl0_hi;
+   }
+   else {
+      /* weight the two levels' colors by lod_fpart using a ufixed(16) ctx */
+      struct lp_build_context fx16;
+      lp_build_context_init(&fx16, bld->builder, lp_type_ufixed(16));
+
+      *colors_lo = lp_build_lerp(&fx16, lod_fpart,
+                                 lvl0_lo, lvl1_lo);
+      *colors_hi = lp_build_lerp(&fx16, lod_fpart,
+                                 lvl0_hi, lvl1_hi);
+   }
+}
+
+
+
+/**
+ * Texture sampling in AoS format.  Used when sampling common 32-bit/texel
+ * formats.  1D/2D/3D/cube textures supported.  All mipmap sampling modes,
+ * but only the simple wrap modes (see the lp_is_simple_wrap_mode() asserts).
+ */
+void
+lp_build_sample_aos(struct lp_build_sample_context *bld,
+                    unsigned unit,
+                    LLVMValueRef s,
+                    LLVMValueRef t,
+                    LLVMValueRef r,
+                    const LLVMValueRef *ddx,
+                    const LLVMValueRef *ddy,
+                    LLVMValueRef lod_bias, /* optional */
+                    LLVMValueRef explicit_lod, /* optional */
+                    LLVMValueRef width,
+                    LLVMValueRef height,
+                    LLVMValueRef depth,
+                    LLVMValueRef width_vec,
+                    LLVMValueRef height_vec,
+                    LLVMValueRef depth_vec,
+                    LLVMValueRef row_stride_array,
+                    LLVMValueRef img_stride_array,
+                    LLVMValueRef data_array,
+                    LLVMValueRef texel_out[4])
+{
+   struct lp_build_context *float_bld = &bld->float_bld;
+   LLVMBuilderRef builder = bld->builder;
+   const unsigned mip_filter = bld->static_state->min_mip_filter;
+   const unsigned min_filter = bld->static_state->min_img_filter;
+   const unsigned mag_filter = bld->static_state->mag_img_filter;
+   const int dims = texture_dims(bld->static_state->target);
+   LLVMValueRef lod = NULL, lod_fpart = NULL;
+   LLVMValueRef ilevel0, ilevel1 = NULL;
+   LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL;
+   LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL;
+   LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL;
+   LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL;
+   LLVMValueRef data_ptr0, data_ptr1 = NULL;
+   LLVMValueRef packed, packed_lo, packed_hi;
+   LLVMValueRef unswizzled[4];
+   LLVMValueRef face_ddx[4], face_ddy[4];
+   struct lp_build_context h16;
+   LLVMTypeRef h16_vec_type;
+
+   /* we only support the common/simple wrap modes at this time */
+   assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s));
+   if (dims >= 2)
+      assert(lp_is_simple_wrap_mode(bld->static_state->wrap_t));
+   if (dims >= 3)
+      assert(lp_is_simple_wrap_mode(bld->static_state->wrap_r));
+
+
+   /* make 16-bit fixed-pt builder context */
+   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
+   h16_vec_type = lp_build_vec_type(h16.type);
+
+
+   /* cube face selection, compute per-face coords, etc. */
+   if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+      LLVMValueRef face, face_s, face_t;
+      lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
+      s = face_s; /* vec */
+      t = face_t; /* vec */
+      /* use 'r' to indicate cube face */
+      r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
+
+      /* recompute ddx, ddy using the new (s,t) face texcoords */
+      face_ddx[0] = lp_build_ddx(&bld->coord_bld, s);
+      face_ddx[1] = lp_build_ddx(&bld->coord_bld, t);
+      face_ddx[2] = NULL;
+      face_ddx[3] = NULL;
+      face_ddy[0] = lp_build_ddy(&bld->coord_bld, s);
+      face_ddy[1] = lp_build_ddy(&bld->coord_bld, t);
+      face_ddy[2] = NULL;
+      face_ddy[3] = NULL;
+      ddx = face_ddx;
+      ddy = face_ddy;
+   }
+
+
+   /*
+    * Compute the level of detail (float).
+    */
+   if (min_filter != mag_filter ||
+       mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+      /* Need to compute lod either to choose mipmap levels or to
+       * distinguish between minification/magnification with one mipmap level.
+       */
+      lod = lp_build_lod_selector(bld, ddx, ddy,
+                                  lod_bias, explicit_lod,
+                                  width, height, depth);
+   }
+
+   /*
+    * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
+    * If mipfilter=linear, also compute the weight between the two
+    * mipmap levels: lod_fpart
+    */
+   switch (mip_filter) {
+   default:
+      assert(0 && "bad mip_filter value in lp_build_sample_aos()");
+      /* fall-through */
+   case PIPE_TEX_MIPFILTER_NONE:
+      /* always use mip level 0 */
+      if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         /* XXX work-around for an apparent bug in LLVM 2.7: setting
+          * ilevel0 = const(0) causes bad x86 code to be emitted.  Use a
+          * local zero lod so 'lod' survives for the min/mag test below.
+          */
+         LLVMValueRef lod0 = lp_build_const_elem(bld->coord_bld.type, 0.0);
+         lp_build_nearest_mip_level(bld, unit, lod0, &ilevel0);
+      }
+      else {
+         ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+      }
+      break;
+   case PIPE_TEX_MIPFILTER_NEAREST:
+      assert(lod);
+      lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
+      break;
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      {
+         LLVMValueRef f256 = LLVMConstReal(LLVMFloatType(), 256.0);
+         LLVMValueRef i255 = lp_build_const_int32(255);
+         LLVMTypeRef i16_type = LLVMIntType(16);
+
+         assert(lod);
+
+         lp_build_linear_mip_levels(bld, unit, lod, &ilevel0, &ilevel1,
+                                    &lod_fpart);
+         lod_fpart = LLVMBuildFMul(builder, lod_fpart, f256, "");
+         lod_fpart = lp_build_ifloor(&bld->float_bld, lod_fpart);
+         lod_fpart = LLVMBuildAnd(builder, lod_fpart, i255, "");
+         lod_fpart = LLVMBuildTrunc(builder, lod_fpart, i16_type, "");
+         lod_fpart = lp_build_broadcast_scalar(&h16, lod_fpart);
+
+         /* the lod_fpart values will be fixed pt values in [0,1) */
+      }
+      break;
+   }
+
+   /* compute image size(s) of source mipmap level(s) */
+   lp_build_mipmap_level_sizes(bld, dims, width_vec, height_vec, depth_vec,
+                               ilevel0, ilevel1,
+                               row_stride_array, img_stride_array,
+                               &width0_vec, &width1_vec,
+                               &height0_vec, &height1_vec,
+                               &depth0_vec, &depth1_vec,
+                               &row_stride0_vec, &row_stride1_vec,
+                               &img_stride0_vec, &img_stride1_vec);
+
+   /*
+    * Get pointer(s) to image data for mipmap level(s).
+    */
+   data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0);
+   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+      data_ptr1 = lp_build_get_mipmap_level(bld, data_array, ilevel1);
+   }
+
+
+   /*
+    * Get/interpolate texture colors.
+    */
+   if (min_filter == mag_filter) {
+      /* no need to distinguish between minification and magnification */
+      lp_build_sample_mipmap(bld, min_filter, mip_filter,
+                             s, t, r, lod_fpart,
+                             width0_vec, width1_vec,
+                             height0_vec, height1_vec,
+                             depth0_vec, depth1_vec,
+                             row_stride0_vec, row_stride1_vec,
+                             img_stride0_vec, img_stride1_vec,
+                             data_ptr0, data_ptr1,
+                             &packed_lo, &packed_hi);
+   }
+   else {
+      /* Emit conditional to choose min image filter or mag image filter
+       * depending on the lod being >= 0 or < 0, respectively.
+       */
+      struct lp_build_flow_context *flow_ctx;
+      struct lp_build_if_state if_ctx;
+      LLVMValueRef minify;
+
+      flow_ctx = lp_build_flow_create(builder);
+      lp_build_flow_scope_begin(flow_ctx);
+
+      packed_lo = LLVMGetUndef(h16_vec_type);
+      packed_hi = LLVMGetUndef(h16_vec_type);
+
+      lp_build_flow_scope_declare(flow_ctx, &packed_lo);
+      lp_build_flow_scope_declare(flow_ctx, &packed_hi);
+
+      /* minify = lod >= 0.0 (unordered compare: also true if lod is NaN) */
+      minify = LLVMBuildFCmp(builder, LLVMRealUGE,
+                             lod, float_bld->zero, "");
+
+      lp_build_if(&if_ctx, flow_ctx, builder, minify);
+      {
+         /* Use the minification filter */
+         lp_build_sample_mipmap(bld, min_filter, mip_filter,
+                                s, t, r, lod_fpart,
+                                width0_vec, width1_vec,
+                                height0_vec, height1_vec,
+                                depth0_vec, depth1_vec,
+                                row_stride0_vec, row_stride1_vec,
+                                img_stride0_vec, img_stride1_vec,
+                                data_ptr0, data_ptr1,
+                                &packed_lo, &packed_hi);
+      }
+      lp_build_else(&if_ctx);
+      {
+         /* Use the magnification filter */
+         lp_build_sample_mipmap(bld, mag_filter, mip_filter,
+                                s, t, r, lod_fpart,
+                                width0_vec, width1_vec,
+                                height0_vec, height1_vec,
+                                depth0_vec, depth1_vec,
+                                row_stride0_vec, row_stride1_vec,
+                                img_stride0_vec, img_stride1_vec,
+                                data_ptr0, data_ptr1,
+                                &packed_lo, &packed_hi);
+      }
+      lp_build_endif(&if_ctx);
+
+      lp_build_flow_scope_end(flow_ctx);
+      lp_build_flow_destroy(flow_ctx);
+   }
+
+   /* combine 'packed_lo', 'packed_hi' into 'packed' */
+   {
+      struct lp_build_context u8n;
+
+      /* source type comes from the function-level h16 (no shadow copy) */
+      lp_build_context_init(&u8n, builder, lp_type_unorm(8));
+
+      packed = lp_build_pack2(builder, h16.type, u8n.type,
+                              packed_lo, packed_hi);
+   }
+
+   /*
+    * Convert to SoA and swizzle.
+    */
+   lp_build_rgba8_to_f32_soa(builder,
+                             bld->texel_type,
+                             packed, unswizzled);
+
+   if (util_format_is_rgba8_variant(bld->format_desc)) {
+      lp_build_format_swizzle_soa(bld->format_desc,
+                                  &bld->texel_bld,
+                                  unswizzled, texel_out);
+   }
+   else {
+      texel_out[0] = unswizzled[0];
+      texel_out[1] = unswizzled[1];
+      texel_out[2] = unswizzled[2];
+      texel_out[3] = unswizzled[3];
+   }
+
+   apply_sampler_swizzle(bld, texel_out);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
new file mode 100644
index 0000000..e1045bb
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
@@ -0,0 +1,65 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling -- AoS.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ * @author Brian Paul <brianp@vmware.com>
+ */
+
+#ifndef LP_BLD_SAMPLE_AOS_H
+#define LP_BLD_SAMPLE_AOS_H
+
+
+#include "lp_bld_sample.h"
+
+
+void
+lp_build_sample_aos(struct lp_build_sample_context *bld,
+                    unsigned unit,
+                    LLVMValueRef s,
+                    LLVMValueRef t,
+                    LLVMValueRef r,
+                    const LLVMValueRef *ddx,
+                    const LLVMValueRef *ddy,
+                    LLVMValueRef lod_bias, /* optional */
+                    LLVMValueRef explicit_lod, /* optional */
+                    LLVMValueRef width,
+                    LLVMValueRef height,
+                    LLVMValueRef depth,
+                    LLVMValueRef width_vec,
+                    LLVMValueRef height_vec,
+                    LLVMValueRef depth_vec,
+                    LLVMValueRef row_stride_array,
+                    LLVMValueRef img_stride_array,
+                    LLVMValueRef data_array,
+                    LLVMValueRef texel_out[4]);
+
+
+#endif /* LP_BLD_SAMPLE_AOS_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index baf0402..f61f23e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -40,6 +40,7 @@
 #include "util/u_memory.h"
 #include "util/u_math.h"
 #include "util/u_format.h"
+#include "util/u_cpu_detect.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
@@ -52,49 +53,11 @@
 #include "lp_bld_gather.h"
 #include "lp_bld_format.h"
 #include "lp_bld_sample.h"
+#include "lp_bld_sample_aos.h"
 #include "lp_bld_quad.h"
 
 
 /**
- * Keep all information for sampling code generation in a single place.
- */
-struct lp_build_sample_context
-{
-   LLVMBuilderRef builder;
-
-   const struct lp_sampler_static_state *static_state;
-
-   struct lp_sampler_dynamic_state *dynamic_state;
-
-   const struct util_format_description *format_desc;
-
-   /** regular scalar float type */
-   struct lp_type float_type;
-   struct lp_build_context float_bld;
-
-   /** regular scalar float type */
-   struct lp_type int_type;
-   struct lp_build_context int_bld;
-
-   /** Incoming coordinates type and build context */
-   struct lp_type coord_type;
-   struct lp_build_context coord_bld;
-
-   /** Unsigned integer coordinates */
-   struct lp_type uint_coord_type;
-   struct lp_build_context uint_coord_bld;
-
-   /** Signed integer coordinates */
-   struct lp_type int_coord_type;
-   struct lp_build_context int_coord_bld;
-
-   /** Output texels type and build context */
-   struct lp_type texel_type;
-   struct lp_build_context texel_bld;
-};
-
-
-/**
  * Does the given texture wrap mode allow sampling the texture border color?
  * XXX maybe move this into gallium util code.
  */
@@ -119,95 +82,10 @@ wrap_mode_uses_border_color(unsigned mode)
 }
 
 
-static LLVMValueRef
-lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
-                          LLVMValueRef data_array, LLVMValueRef level)
-{
-   LLVMValueRef indexes[2], data_ptr;
-   indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
-   indexes[1] = level;
-   data_ptr = LLVMBuildGEP(bld->builder, data_array, indexes, 2, "");
-   data_ptr = LLVMBuildLoad(bld->builder, data_ptr, "");
-   return data_ptr;
-}
-
-
-static LLVMValueRef
-lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
-                                LLVMValueRef data_array, int level)
-{
-   LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0);
-   return lp_build_get_mipmap_level(bld, data_array, lvl);
-}
-
-
-/**
- * Dereference stride_array[mipmap_level] array to get a stride.
- * Return stride as a vector.
- */
-static LLVMValueRef
-lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
-                              LLVMValueRef stride_array, LLVMValueRef level)
-{
-   LLVMValueRef indexes[2], stride;
-   indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
-   indexes[1] = level;
-   stride = LLVMBuildGEP(bld->builder, stride_array, indexes, 2, "");
-   stride = LLVMBuildLoad(bld->builder, stride, "");
-   stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride);
-   return stride;
-}
-
-
-/** Dereference stride_array[0] array to get a stride (as vector). */
-static LLVMValueRef
-lp_build_get_const_level_stride_vec(struct lp_build_sample_context *bld,
-                                    LLVMValueRef stride_array, int level)
-{
-   LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0);
-   return lp_build_get_level_stride_vec(bld, stride_array, lvl);
-}
-
-
-static int
-texture_dims(enum pipe_texture_target tex)
-{
-   switch (tex) {
-   case PIPE_TEXTURE_1D:
-      return 1;
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_RECT:
-   case PIPE_TEXTURE_CUBE:
-      return 2;
-   case PIPE_TEXTURE_3D:
-      return 3;
-   default:
-      assert(0 && "bad texture target in texture_dims()");
-      return 2;
-   }
-}
-
-
-static void
-apply_sampler_swizzle(struct lp_build_sample_context *bld,
-                      LLVMValueRef *texel)
-{
-   unsigned char swizzles[4];
-
-   swizzles[0] = bld->static_state->swizzle_r;
-   swizzles[1] = bld->static_state->swizzle_g;
-   swizzles[2] = bld->static_state->swizzle_b;
-   swizzles[3] = bld->static_state->swizzle_a;
-
-   lp_build_swizzle_soa_inplace(&bld->texel_bld, texel, swizzles);
-}
-
-
-
 /**
  * Generate code to fetch a texel from a texture at int coords (x, y, z).
  * The computation depends on whether the texture is 1D, 2D or 3D.
- * The result, texel, will be:
+ * The result, texel, will be float vectors:
  *   texel[0] = red values
  *   texel[1] = green values
  *   texel[2] = blue values
@@ -356,204 +234,6 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld,
 
 
 /**
- * We only support a few wrap modes in lp_build_sample_wrap_linear_int() at this time.
- * Return whether the given mode is supported by that function.
- */
-static boolean
-is_simple_wrap_mode(unsigned mode)
-{
-   switch (mode) {
-   case PIPE_TEX_WRAP_REPEAT:
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      return TRUE;
-   default:
-      return FALSE;
-   }
-}
-
-
-/**
- * Build LLVM code for texture wrap mode, for scaled integer texcoords.
- * \param coord  the incoming texcoord (s,t,r or q) scaled to the texture size
- * \param length  the texture size along one dimension
- * \param is_pot  if TRUE, length is a power of two
- * \param wrap_mode  one of PIPE_TEX_WRAP_x
- * \param i0  resulting sub-block pixel coordinate for coord0
- */
-static void
-lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
-                                 unsigned block_length,
-                                 LLVMValueRef coord,
-                                 LLVMValueRef length,
-                                 LLVMValueRef stride,
-                                 boolean is_pot,
-                                 unsigned wrap_mode,
-                                 LLVMValueRef *out_offset,
-                                 LLVMValueRef *out_i)
-{
-   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
-   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
-   LLVMValueRef length_minus_one;
-
-   length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
-
-   switch(wrap_mode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      if(is_pot)
-         coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, "");
-      else
-         /* Signed remainder won't give the right results for negative
-          * dividends but unsigned remainder does.*/
-         coord = LLVMBuildURem(bld->builder, coord, length, "");
-      break;
-
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
-      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
-      break;
-
-   case PIPE_TEX_WRAP_CLAMP:
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-   default:
-      assert(0);
-   }
-
-   lp_build_sample_partial_offset(uint_coord_bld, block_length, coord, stride,
-                                  out_offset, out_i);
-}
-
-
-/**
- * Build LLVM code for texture wrap mode, for scaled integer texcoords.
- * \param coord0  the incoming texcoord (s,t,r or q) scaled to the texture size
- * \param length  the texture size along one dimension
- * \param stride  pixel stride along the coordinate axis
- * \param block_length  is the length of the pixel block along the
- *                      coordinate axis
- * \param is_pot  if TRUE, length is a power of two
- * \param wrap_mode  one of PIPE_TEX_WRAP_x
- * \param offset0  resulting relative offset for coord0
- * \param offset1  resulting relative offset for coord0 + 1
- * \param i0  resulting sub-block pixel coordinate for coord0
- * \param i1  resulting sub-block pixel coordinate for coord0 + 1
- */
-static void
-lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
-                                unsigned block_length,
-                                LLVMValueRef coord0,
-                                LLVMValueRef length,
-                                LLVMValueRef stride,
-                                boolean is_pot,
-                                unsigned wrap_mode,
-                                LLVMValueRef *offset0,
-                                LLVMValueRef *offset1,
-                                LLVMValueRef *i0,
-                                LLVMValueRef *i1)
-{
-   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
-   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
-   LLVMValueRef length_minus_one;
-   LLVMValueRef lmask, umask, mask;
-
-   if (block_length != 1) {
-      /*
-       * If the pixel block covers more than one pixel then there is no easy
-       * way to calculate offset1 relative to offset0. Instead, compute them
-       * independently.
-       */
-
-      LLVMValueRef coord1;
-
-      lp_build_sample_wrap_nearest_int(bld,
-                                       block_length,
-                                       coord0,
-                                       length,
-                                       stride,
-                                       is_pot,
-                                       wrap_mode,
-                                       offset0, i0);
-
-      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
-
-      lp_build_sample_wrap_nearest_int(bld,
-                                       block_length,
-                                       coord1,
-                                       length,
-                                       stride,
-                                       is_pot,
-                                       wrap_mode,
-                                       offset1, i1);
-
-      return;
-   }
-
-   /*
-    * Scalar pixels -- try to compute offset0 and offset1 with a single stride
-    * multiplication.
-    */
-
-   *i0 = uint_coord_bld->zero;
-   *i1 = uint_coord_bld->zero;
-
-   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
-
-   switch(wrap_mode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      if (is_pot) {
-         coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, "");
-      }
-      else {
-         /* Signed remainder won't give the right results for negative
-          * dividends but unsigned remainder does.*/
-         coord0 = LLVMBuildURem(bld->builder, coord0, length, "");
-      }
-
-      mask = lp_build_compare(bld->builder, int_coord_bld->type,
-                              PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
-
-      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
-      *offset1 = LLVMBuildAnd(bld->builder,
-                              lp_build_add(uint_coord_bld, *offset0, stride),
-                              mask, "");
-      break;
-
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      lmask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type,
-                               PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
-      umask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type,
-                               PIPE_FUNC_LESS, coord0, length_minus_one);
-
-      coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
-      coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);
-
-      mask = LLVMBuildAnd(bld->builder, lmask, umask, "");
-
-      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
-      *offset1 = lp_build_add(uint_coord_bld,
-                              *offset0,
-                              LLVMBuildAnd(bld->builder, stride, mask, ""));
-      break;
-
-   case PIPE_TEX_WRAP_CLAMP:
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-   default:
-      assert(0);
-      *offset0 = uint_coord_bld->zero;
-      *offset1 = uint_coord_bld->zero;
-      break;
-   }
-}
-
-
-/**
  * Build LLVM code for texture wrap mode for linear filtering.
  * \param x0_out  returns first integer texcoord
  * \param x1_out  returns second integer texcoord
@@ -765,7 +445,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
 /**
  * Build LLVM code for texture wrap mode for nearest filtering.
  * \param coord  the incoming texcoord (nominally in [0,1])
- * \param length  the texture size along one dimension, as int
+ * \param length  the texture size along one dimension, as int vector
  * \param is_pot  if TRUE, length is a power of two
  * \param wrap_mode  one of PIPE_TEX_WRAP_x
  */
@@ -882,198 +562,6 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
 
 
 /**
- * Codegen equivalent for u_minify().
- * Return max(1, base_size >> level);
- */
-static LLVMValueRef
-lp_build_minify(struct lp_build_sample_context *bld,
-                LLVMValueRef base_size,
-                LLVMValueRef level)
-{
-   LLVMValueRef size = LLVMBuildLShr(bld->builder, base_size, level, "minify");
-   size = lp_build_max(&bld->int_coord_bld, size, bld->int_coord_bld.one);
-   return size;
-}
-
-
-/**
- * Generate code to compute texture level of detail (lambda).
- * \param ddx  partial derivatives of (s, t, r, q) with respect to X
- * \param ddy  partial derivatives of (s, t, r, q) with respect to Y
- * \param lod_bias  optional float vector with the shader lod bias
- * \param explicit_lod  optional float vector with the explicit lod
- * \param width  scalar int texture width
- * \param height  scalar int texture height
- * \param depth  scalar int texture depth
- *
- * XXX: The resulting lod is scalar, so ignore all but the first element of
- * derivatives, lod_bias, etc that are passed by the shader.
- */
-static LLVMValueRef
-lp_build_lod_selector(struct lp_build_sample_context *bld,
-                      const LLVMValueRef ddx[4],
-                      const LLVMValueRef ddy[4],
-                      LLVMValueRef lod_bias, /* optional */
-                      LLVMValueRef explicit_lod, /* optional */
-                      LLVMValueRef width,
-                      LLVMValueRef height,
-                      LLVMValueRef depth)
-
-{
-   if (bld->static_state->min_lod == bld->static_state->max_lod) {
-      /* User is forcing sampling from a particular mipmap level.
-       * This is hit during mipmap generation.
-       */
-      return LLVMConstReal(LLVMFloatType(), bld->static_state->min_lod);
-   }
-   else {
-      struct lp_build_context *float_bld = &bld->float_bld;
-      LLVMValueRef sampler_lod_bias = LLVMConstReal(LLVMFloatType(),
-                                                    bld->static_state->lod_bias);
-      LLVMValueRef min_lod = LLVMConstReal(LLVMFloatType(),
-                                           bld->static_state->min_lod);
-      LLVMValueRef max_lod = LLVMConstReal(LLVMFloatType(),
-                                           bld->static_state->max_lod);
-      LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
-      LLVMValueRef lod;
-
-      if (explicit_lod) {
-         lod = LLVMBuildExtractElement(bld->builder, explicit_lod,
-                                       index0, "");
-      }
-      else {
-         const int dims = texture_dims(bld->static_state->target);
-         LLVMValueRef dsdx, dsdy;
-         LLVMValueRef dtdx = NULL, dtdy = NULL, drdx = NULL, drdy = NULL;
-         LLVMValueRef rho;
-
-         dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx");
-         dsdx = lp_build_abs(float_bld, dsdx);
-         dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy");
-         dsdy = lp_build_abs(float_bld, dsdy);
-         if (dims > 1) {
-            dtdx = LLVMBuildExtractElement(bld->builder, ddx[1], index0, "dtdx");
-            dtdx = lp_build_abs(float_bld, dtdx);
-            dtdy = LLVMBuildExtractElement(bld->builder, ddy[1], index0, "dtdy");
-            dtdy = lp_build_abs(float_bld, dtdy);
-            if (dims > 2) {
-               drdx = LLVMBuildExtractElement(bld->builder, ddx[2], index0, "drdx");
-               drdx = lp_build_abs(float_bld, drdx);
-               drdy = LLVMBuildExtractElement(bld->builder, ddy[2], index0, "drdy");
-               drdy = lp_build_abs(float_bld, drdy);
-            }
-         }
-
-         /* Compute rho = max of all partial derivatives scaled by texture size.
-          * XXX this could be vectorized somewhat
-          */
-         rho = LLVMBuildFMul(bld->builder,
-                            lp_build_max(float_bld, dsdx, dsdy),
-                            lp_build_int_to_float(float_bld, width), "");
-         if (dims > 1) {
-            LLVMValueRef max;
-            max = LLVMBuildFMul(bld->builder,
-                               lp_build_max(float_bld, dtdx, dtdy),
-                               lp_build_int_to_float(float_bld, height), "");
-            rho = lp_build_max(float_bld, rho, max);
-            if (dims > 2) {
-               max = LLVMBuildFMul(bld->builder,
-                                  lp_build_max(float_bld, drdx, drdy),
-                                  lp_build_int_to_float(float_bld, depth), "");
-               rho = lp_build_max(float_bld, rho, max);
-            }
-         }
-
-         /* compute lod = log2(rho) */
-         lod = lp_build_log2(float_bld, rho);
-
-         /* add shader lod bias */
-         if (lod_bias) {
-            lod_bias = LLVMBuildExtractElement(bld->builder, lod_bias,
-                                               index0, "");
-            lod = LLVMBuildFAdd(bld->builder, lod, lod_bias, "shader_lod_bias");
-         }
-      }
-
-      /* add sampler lod bias */
-      lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias");
-
-      /* clamp lod */
-      lod = lp_build_clamp(float_bld, lod, min_lod, max_lod);
-
-      return lod;
-   }
-}
-
-
-/**
- * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
- * mipmap level index.
- * Note: this is all scalar code.
- * \param lod  scalar float texture level of detail
- * \param level_out  returns integer 
- */
-static void
-lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
-                           unsigned unit,
-                           LLVMValueRef lod,
-                           LLVMValueRef *level_out)
-{
-   struct lp_build_context *float_bld = &bld->float_bld;
-   struct lp_build_context *int_bld = &bld->int_bld;
-   LLVMValueRef last_level, level;
-
-   LLVMValueRef zero = LLVMConstInt(LLVMInt32Type(), 0, 0);
-
-   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
-                                               bld->builder, unit);
-
-   /* convert float lod to integer */
-   level = lp_build_iround(float_bld, lod);
-
-   /* clamp level to legal range of levels */
-   *level_out = lp_build_clamp(int_bld, level, zero, last_level);
-}
-
-
-/**
- * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to
- * two (adjacent) mipmap level indexes.  Later, we'll sample from those
- * two mipmap levels and interpolate between them.
- */
-static void
-lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
-                           unsigned unit,
-                           LLVMValueRef lod,
-                           LLVMValueRef *level0_out,
-                           LLVMValueRef *level1_out,
-                           LLVMValueRef *weight_out)
-{
-   struct lp_build_context *float_bld = &bld->float_bld;
-   struct lp_build_context *int_bld = &bld->int_bld;
-   LLVMValueRef last_level, level;
-
-   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
-                                               bld->builder, unit);
-
-   /* convert float lod to integer */
-   level = lp_build_ifloor(float_bld, lod);
-
-   /* compute level 0 and clamp to legal range of levels */
-   *level0_out = lp_build_clamp(int_bld, level,
-                                int_bld->zero,
-                                last_level);
-   /* compute level 1 and clamp to legal range of levels */
-   level = lp_build_add(int_bld, level, int_bld->one);
-   *level1_out = lp_build_clamp(int_bld, level,
-                                int_bld->zero,
-                                last_level);
-
-   *weight_out = lp_build_fract(float_bld, lod);
-}
-
-
-/**
  * Generate code to sample a mipmap level with nearest filtering.
  * If sampling a cube texture, r = cube face in [0,5].
  */
@@ -1291,207 +779,6 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
 }
 
 
-/** Helper used by lp_build_cube_lookup() */
-static LLVMValueRef
-lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
-{
-   /* ima = -0.5 / abs(coord); */
-   LLVMValueRef negHalf = lp_build_const_vec(coord_bld->type, -0.5);
-   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
-   LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord);
-   return ima;
-}
-
-
-/**
- * Helper used by lp_build_cube_lookup()
- * \param sign  scalar +1 or -1
- * \param coord  float vector
- * \param ima  float vector
- */
-static LLVMValueRef
-lp_build_cube_coord(struct lp_build_context *coord_bld,
-                    LLVMValueRef sign, int negate_coord,
-                    LLVMValueRef coord, LLVMValueRef ima)
-{
-   /* return negate(coord) * ima * sign + 0.5; */
-   LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5);
-   LLVMValueRef res;
-
-   assert(negate_coord == +1 || negate_coord == -1);
-
-   if (negate_coord == -1) {
-      coord = lp_build_negate(coord_bld, coord);
-   }
-
-   res = lp_build_mul(coord_bld, coord, ima);
-   if (sign) {
-      sign = lp_build_broadcast_scalar(coord_bld, sign);
-      res = lp_build_mul(coord_bld, res, sign);
-   }
-   res = lp_build_add(coord_bld, res, half);
-
-   return res;
-}
-
-
-/** Helper used by lp_build_cube_lookup()
- * Return (major_coord >= 0) ? pos_face : neg_face;
- */
-static LLVMValueRef
-lp_build_cube_face(struct lp_build_sample_context *bld,
-                   LLVMValueRef major_coord,
-                   unsigned pos_face, unsigned neg_face)
-{
-   LLVMValueRef cmp = LLVMBuildFCmp(bld->builder, LLVMRealUGE,
-                                    major_coord,
-                                    bld->float_bld.zero, "");
-   LLVMValueRef pos = LLVMConstInt(LLVMInt32Type(), pos_face, 0);
-   LLVMValueRef neg = LLVMConstInt(LLVMInt32Type(), neg_face, 0);
-   LLVMValueRef res = LLVMBuildSelect(bld->builder, cmp, pos, neg, "");
-   return res;
-}
-
-
-
-/**
- * Generate code to do cube face selection and compute per-face texcoords.
- */
-static void
-lp_build_cube_lookup(struct lp_build_sample_context *bld,
-                     LLVMValueRef s,
-                     LLVMValueRef t,
-                     LLVMValueRef r,
-                     LLVMValueRef *face,
-                     LLVMValueRef *face_s,
-                     LLVMValueRef *face_t)
-{
-   struct lp_build_context *float_bld = &bld->float_bld;
-   struct lp_build_context *coord_bld = &bld->coord_bld;
-   LLVMValueRef rx, ry, rz;
-   LLVMValueRef arx, ary, arz;
-   LLVMValueRef c25 = LLVMConstReal(LLVMFloatType(), 0.25);
-   LLVMValueRef arx_ge_ary, arx_ge_arz;
-   LLVMValueRef ary_ge_arx, ary_ge_arz;
-   LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
-   LLVMValueRef rx_pos, ry_pos, rz_pos;
-
-   assert(bld->coord_bld.type.length == 4);
-
-   /*
-    * Use the average of the four pixel's texcoords to choose the face.
-    */
-   rx = lp_build_mul(float_bld, c25,
-                     lp_build_sum_vector(&bld->coord_bld, s));
-   ry = lp_build_mul(float_bld, c25,
-                     lp_build_sum_vector(&bld->coord_bld, t));
-   rz = lp_build_mul(float_bld, c25,
-                     lp_build_sum_vector(&bld->coord_bld, r));
-
-   arx = lp_build_abs(float_bld, rx);
-   ary = lp_build_abs(float_bld, ry);
-   arz = lp_build_abs(float_bld, rz);
-
-   /*
-    * Compare sign/magnitude of rx,ry,rz to determine face
-    */
-   arx_ge_ary = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, ary, "");
-   arx_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, arz, "");
-   ary_ge_arx = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arx, "");
-   ary_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arz, "");
-
-   arx_ge_ary_arz = LLVMBuildAnd(bld->builder, arx_ge_ary, arx_ge_arz, "");
-   ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, "");
-
-   rx_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rx, float_bld->zero, "");
-   ry_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ry, float_bld->zero, "");
-   rz_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rz, float_bld->zero, "");
-
-   {
-      struct lp_build_flow_context *flow_ctx;
-      struct lp_build_if_state if_ctx;
-
-      flow_ctx = lp_build_flow_create(bld->builder);
-      lp_build_flow_scope_begin(flow_ctx);
-
-      *face_s = bld->coord_bld.undef;
-      *face_t = bld->coord_bld.undef;
-      *face = bld->int_bld.undef;
-
-      lp_build_name(*face_s, "face_s");
-      lp_build_name(*face_t, "face_t");
-      lp_build_name(*face, "face");
-
-      lp_build_flow_scope_declare(flow_ctx, face_s);
-      lp_build_flow_scope_declare(flow_ctx, face_t);
-      lp_build_flow_scope_declare(flow_ctx, face);
-
-      lp_build_if(&if_ctx, flow_ctx, bld->builder, arx_ge_ary_arz);
-      {
-         /* +/- X face */
-         LLVMValueRef sign = lp_build_sgn(float_bld, rx);
-         LLVMValueRef ima = lp_build_cube_ima(coord_bld, s);
-         *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
-         *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
-         *face = lp_build_cube_face(bld, rx,
-                                    PIPE_TEX_FACE_POS_X,
-                                    PIPE_TEX_FACE_NEG_X);
-      }
-      lp_build_else(&if_ctx);
-      {
-         struct lp_build_flow_context *flow_ctx2;
-         struct lp_build_if_state if_ctx2;
-
-         LLVMValueRef face_s2 = bld->coord_bld.undef;
-         LLVMValueRef face_t2 = bld->coord_bld.undef;
-         LLVMValueRef face2 = bld->int_bld.undef;
-
-         flow_ctx2 = lp_build_flow_create(bld->builder);
-         lp_build_flow_scope_begin(flow_ctx2);
-         lp_build_flow_scope_declare(flow_ctx2, &face_s2);
-         lp_build_flow_scope_declare(flow_ctx2, &face_t2);
-         lp_build_flow_scope_declare(flow_ctx2, &face2);
-
-         ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, "");
-
-         lp_build_if(&if_ctx2, flow_ctx2, bld->builder, ary_ge_arx_arz);
-         {
-            /* +/- Y face */
-            LLVMValueRef sign = lp_build_sgn(float_bld, ry);
-            LLVMValueRef ima = lp_build_cube_ima(coord_bld, t);
-            face_s2 = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
-            face_t2 = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
-            face2 = lp_build_cube_face(bld, ry,
-                                       PIPE_TEX_FACE_POS_Y,
-                                       PIPE_TEX_FACE_NEG_Y);
-         }
-         lp_build_else(&if_ctx2);
-         {
-            /* +/- Z face */
-            LLVMValueRef sign = lp_build_sgn(float_bld, rz);
-            LLVMValueRef ima = lp_build_cube_ima(coord_bld, r);
-            face_s2 = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
-            face_t2 = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
-            face2 = lp_build_cube_face(bld, rz,
-                                       PIPE_TEX_FACE_POS_Z,
-                                       PIPE_TEX_FACE_NEG_Z);
-         }
-         lp_build_endif(&if_ctx2);
-         lp_build_flow_scope_end(flow_ctx2);
-         lp_build_flow_destroy(flow_ctx2);
-         *face_s = face_s2;
-         *face_t = face_t2;
-         *face = face2;
-      }
-
-      lp_build_endif(&if_ctx);
-      lp_build_flow_scope_end(flow_ctx);
-      lp_build_flow_destroy(flow_ctx);
-   }
-}
-
-
-
 /**
  * Sample the texture/mipmap using given image filter and mip filter.
  * data0_ptr and data1_ptr point to the two mipmap levels to sample
@@ -1605,7 +892,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
    const unsigned mag_filter = bld->static_state->mag_img_filter;
    const int dims = texture_dims(bld->static_state->target);
    LLVMValueRef lod = NULL, lod_fpart = NULL;
-   LLVMValueRef ilevel0, ilevel1 = NULL, ilevel0_vec, ilevel1_vec = NULL;
+   LLVMValueRef ilevel0, ilevel1 = NULL;
    LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL;
    LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL;
    LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL;
@@ -1685,47 +972,15 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       }
    }
 
-   /*
-    * Convert scalar integer mipmap levels into vectors.
-    */
-   ilevel0_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel0);
-   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR)
-      ilevel1_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel1);
-
-   /*
-    * Compute width, height at mipmap level 'ilevel0'
-    */
-   width0_vec = lp_build_minify(bld, width_vec, ilevel0_vec);
-   if (dims >= 2) {
-      height0_vec = lp_build_minify(bld, height_vec, ilevel0_vec);
-      row_stride0_vec = lp_build_get_level_stride_vec(bld, row_stride_array,
-                                                      ilevel0);
-      if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
-         img_stride0_vec = lp_build_get_level_stride_vec(bld,
-                                                         img_stride_array,
-                                                         ilevel0);
-         if (dims == 3) {
-            depth0_vec = lp_build_minify(bld, depth_vec, ilevel0_vec);
-         }
-      }
-   }
-   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-      /* compute width, height, depth for second mipmap level at 'ilevel1' */
-      width1_vec = lp_build_minify(bld, width_vec, ilevel1_vec);
-      if (dims >= 2) {
-         height1_vec = lp_build_minify(bld, height_vec, ilevel1_vec);
-         row_stride1_vec = lp_build_get_level_stride_vec(bld, row_stride_array,
-                                                         ilevel1);
-         if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
-            img_stride1_vec = lp_build_get_level_stride_vec(bld,
-                                                            img_stride_array,
-                                                            ilevel1);
-            if (dims ==3) {
-               depth1_vec = lp_build_minify(bld, depth_vec, ilevel1_vec);
-            }
-         }
-      }
-   }
+   /* compute image size(s) of source mipmap level(s) */
+   lp_build_mipmap_level_sizes(bld, dims, width_vec, height_vec, depth_vec,
+                               ilevel0, ilevel1,
+                               row_stride_array, img_stride_array,
+                               &width0_vec, &width1_vec,
+                               &height0_vec, &height1_vec,
+                               &depth0_vec, &depth1_vec,
+                               &row_stride0_vec, &row_stride1_vec,
+                               &img_stride0_vec, &img_stride1_vec);
 
    /*
     * Get pointer(s) to image data for mipmap level(s).
@@ -1803,258 +1058,6 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
 }
 
 
-
-static void
-lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
-                              LLVMValueRef s,
-                              LLVMValueRef t,
-                              LLVMValueRef width,
-                              LLVMValueRef height,
-                              LLVMValueRef stride_array,
-                              LLVMValueRef data_array,
-                              LLVMValueRef texel_out[4])
-{
-   LLVMBuilderRef builder = bld->builder;
-   struct lp_build_context i32, h16, u8n;
-   LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
-   LLVMValueRef i32_c8, i32_c128, i32_c255;
-   LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
-   LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi;
-   LLVMValueRef data_ptr;
-   LLVMValueRef x_stride, y_stride;
-   LLVMValueRef x_offset0, x_offset1;
-   LLVMValueRef y_offset0, y_offset1;
-   LLVMValueRef offset[2][2];
-   LLVMValueRef x_subcoord[2], y_subcoord[2];
-   LLVMValueRef neighbors_lo[2][2];
-   LLVMValueRef neighbors_hi[2][2];
-   LLVMValueRef packed, packed_lo, packed_hi;
-   LLVMValueRef unswizzled[4];
-   const unsigned level = 0;
-   unsigned i, j;
-
-   assert(bld->static_state->target == PIPE_TEXTURE_2D
-         || bld->static_state->target == PIPE_TEXTURE_RECT);
-   assert(bld->static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR);
-   assert(bld->static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR);
-   assert(bld->static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE);
-
-   lp_build_context_init(&i32, builder, lp_type_int_vec(32));
-   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
-   lp_build_context_init(&u8n, builder, lp_type_unorm(8));
-
-   i32_vec_type = lp_build_vec_type(i32.type);
-   h16_vec_type = lp_build_vec_type(h16.type);
-   u8n_vec_type = lp_build_vec_type(u8n.type);
-
-   if (bld->static_state->normalized_coords) {
-      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type);
-      LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width, coord_vec_type, "");
-      LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height, coord_vec_type, "");
-      s = lp_build_mul(&bld->coord_bld, s, fp_width);
-      t = lp_build_mul(&bld->coord_bld, t, fp_height);
-   }
-
-   /* scale coords by 256 (8 fractional bits) */
-   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
-   t = lp_build_mul_imm(&bld->coord_bld, t, 256);
-
-   /* convert float to int */
-   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
-   t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
-
-   /* subtract 0.5 (add -128) */
-   i32_c128 = lp_build_const_int_vec(i32.type, -128);
-   s = LLVMBuildAdd(builder, s, i32_c128, "");
-   t = LLVMBuildAdd(builder, t, i32_c128, "");
-
-   /* compute floor (shift right 8) */
-   i32_c8 = lp_build_const_int_vec(i32.type, 8);
-   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
-   t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
-
-   /* compute fractional part (AND with 0xff) */
-   i32_c255 = lp_build_const_int_vec(i32.type, 255);
-   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
-   t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
-
-   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
-                                 bld->format_desc->block.bits/8);
-
-   y_stride = lp_build_get_const_level_stride_vec(bld, stride_array, level);
-
-   lp_build_sample_wrap_linear_int(bld,
-                                   bld->format_desc->block.width,
-                                   s_ipart, width, x_stride,
-                                   bld->static_state->pot_width,
-                                   bld->static_state->wrap_s,
-                                   &x_offset0, &x_offset1,
-                                   &x_subcoord[0], &x_subcoord[1]);
-   lp_build_sample_wrap_linear_int(bld,
-                                   bld->format_desc->block.height,
-                                   t_ipart, height, y_stride,
-                                   bld->static_state->pot_height,
-                                   bld->static_state->wrap_t,
-                                   &y_offset0, &y_offset1,
-                                   &y_subcoord[0], &y_subcoord[1]);
-
-   offset[0][0] = lp_build_add(&bld->uint_coord_bld, x_offset0, y_offset0);
-   offset[0][1] = lp_build_add(&bld->uint_coord_bld, x_offset1, y_offset0);
-   offset[1][0] = lp_build_add(&bld->uint_coord_bld, x_offset0, y_offset1);
-   offset[1][1] = lp_build_add(&bld->uint_coord_bld, x_offset1, y_offset1);
-
-   /*
-    * Transform 4 x i32 in
-    *
-    *   s_fpart = {s0, s1, s2, s3}
-    *
-    * into 8 x i16
-    *
-    *   s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
-    *
-    * into two 8 x i16
-    *
-    *   s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
-    *   s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
-    *
-    * and likewise for t_fpart. There is no risk of loosing precision here
-    * since the fractional parts only use the lower 8bits.
-    */
-
-   s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
-   t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
-
-   {
-      LLVMTypeRef elem_type = LLVMInt32Type();
-      LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
-      LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
-      LLVMValueRef shuffle_lo;
-      LLVMValueRef shuffle_hi;
-
-      for(j = 0; j < h16.type.length; j += 4) {
-#ifdef PIPE_ARCH_LITTLE_ENDIAN
-         unsigned subindex = 0;
-#else
-         unsigned subindex = 1;
-#endif
-         LLVMValueRef index;
-
-         index = LLVMConstInt(elem_type, j/2 + subindex, 0);
-         for(i = 0; i < 4; ++i)
-            shuffles_lo[j + i] = index;
-
-         index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
-         for(i = 0; i < 4; ++i)
-            shuffles_hi[j + i] = index;
-      }
-
-      shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
-      shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
-
-      s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_lo, "");
-      t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_lo, "");
-      s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_hi, "");
-      t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, "");
-   }
-
-   /*
-    * get pointer to mipmap level 0 data
-    */
-   data_ptr = lp_build_get_const_mipmap_level(bld, data_array, level);
-
-   /*
-    * Fetch the pixels as 4 x 32bit (rgba order might differ):
-    *
-    *   rgba0 rgba1 rgba2 rgba3
-    *
-    * bit cast them into 16 x u8
-    *
-    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
-    *
-    * unpack them into two 8 x i16:
-    *
-    *   r0 g0 b0 a0 r1 g1 b1 a1
-    *   r2 g2 b2 a2 r3 g3 b3 a3
-    *
-    * The higher 8 bits of the resulting elements will be zero.
-    */
-
-   for (j = 0; j < 2; ++j) {
-      for (i = 0; i < 2; ++i) {
-         LLVMValueRef rgba8;
-
-         if (util_format_is_rgba8_variant(bld->format_desc)) {
-            /*
-             * Given the format is a rgba8, just read the pixels as is,
-             * without any swizzling. Swizzling will be done later.
-             */
-            rgba8 = lp_build_gather(bld->builder,
-                                    bld->texel_type.length,
-                                    bld->format_desc->block.bits,
-                                    bld->texel_type.width,
-                                    data_ptr, offset[j][i]);
-
-            rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
-
-         }
-         else {
-            rgba8 = lp_build_fetch_rgba_aos(bld->builder,
-                                            bld->format_desc,
-                                            u8n.type,
-                                            data_ptr, offset[j][i],
-                                            x_subcoord[i],
-                                            y_subcoord[j]);
-         }
-
-         lp_build_unpack2(builder, u8n.type, h16.type,
-                          rgba8,
-                          &neighbors_lo[j][i], &neighbors_hi[j][i]);
-      }
-   }
-
-   /*
-    * Linear interpolate with 8.8 fixed point.
-    */
-
-   packed_lo = lp_build_lerp_2d(&h16,
-                                s_fpart_lo, t_fpart_lo,
-                                neighbors_lo[0][0],
-                                neighbors_lo[0][1],
-                                neighbors_lo[1][0],
-                                neighbors_lo[1][1]);
-
-   packed_hi = lp_build_lerp_2d(&h16,
-                                s_fpart_hi, t_fpart_hi,
-                                neighbors_hi[0][0],
-                                neighbors_hi[0][1],
-                                neighbors_hi[1][0],
-                                neighbors_hi[1][1]);
-
-   packed = lp_build_pack2(builder, h16.type, u8n.type, packed_lo, packed_hi);
-
-   /*
-    * Convert to SoA and swizzle.
-    */
-
-   lp_build_rgba8_to_f32_soa(bld->builder,
-                             bld->texel_type,
-                             packed, unswizzled);
-
-   if (util_format_is_rgba8_variant(bld->format_desc)) {
-      lp_build_format_swizzle_soa(bld->format_desc,
-                                  &bld->texel_bld,
-                                  unswizzled, texel_out);
-   } else {
-      texel_out[0] = unswizzled[0];
-      texel_out[1] = unswizzled[1];
-      texel_out[2] = unswizzled[2];
-      texel_out[3] = unswizzled[3];
-   }
-
-   apply_sampler_swizzle(bld, texel_out);
-}
-
-
 static void
 lp_build_sample_compare(struct lp_build_sample_context *bld,
                         LLVMValueRef p,
@@ -2181,6 +1184,7 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    t = coords[1];
    r = coords[2];
 
+   /* width, height, depth as uint vectors */
    width_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, width);
    height_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, height);
    depth_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, depth);
@@ -2190,27 +1194,30 @@ lp_build_sample_soa(LLVMBuilderRef builder,
       lp_build_sample_nop(&bld, texel_out);
    }
    else if (util_format_fits_8unorm(bld.format_desc) &&
-            (static_state->target == PIPE_TEXTURE_2D ||
-             static_state->target == PIPE_TEXTURE_RECT) &&
-            static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR &&
-            static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR &&
-            static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
-            is_simple_wrap_mode(static_state->wrap_s) &&
-            is_simple_wrap_mode(static_state->wrap_t)) {
-      /* special case */
-      lp_build_sample_2d_linear_aos(&bld, s, t, width_vec, height_vec,
-                                    row_stride_array, data_array, texel_out);
+            lp_is_simple_wrap_mode(static_state->wrap_s) &&
+            lp_is_simple_wrap_mode(static_state->wrap_t)) {
+      /* do sampling/filtering with fixed pt arithmetic */
+      lp_build_sample_aos(&bld, unit, s, t, r, ddx, ddy,
+                          lod_bias, explicit_lod,
+                          width, height, depth,
+                          width_vec, height_vec, depth_vec,
+                          row_stride_array, img_stride_array,
+                          data_array, texel_out);
    }
+
    else {
-      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
-          (static_state->min_img_filter != PIPE_TEX_FILTER_NEAREST ||
-           static_state->mag_img_filter != PIPE_TEX_FILTER_NEAREST ||
-           static_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) &&
+      if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
           util_format_fits_8unorm(bld.format_desc)) {
          debug_printf("%s: using floating point linear filtering for %s\n",
                       __FUNCTION__, bld.format_desc->short_name);
+         debug_printf("  min_img %d  mag_img %d  mip %d  wraps %d  wrapt %d\n",
+                      static_state->min_img_filter,
+                      static_state->mag_img_filter,
+                      static_state->min_mip_filter,
+                      static_state->wrap_s,
+                      static_state->wrap_t);
       }
 
       lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy,
                               lod_bias, explicit_lod,
                               width, height, depth,
-- 
2.7.4