From 3c98e3cd63012246346e6054c5c16d368f899062 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Sun, 11 Dec 2016 23:41:07 +0100
Subject: [PATCH] gallivm: provide soa fetch path handling formats with more
 than 32bit

This previously always fell back to AoS conversion. Even for 4-float formats
(which is the optimal case by far for that fallback case) this was suboptimal,
since it meant the conversion couldn't be done with 256bit vectors. While this
may still only be partly possible for some formats, (unless there's AVX2
support) at least the transpose can be done with half the unpacks
(and before using the transpose for AoS fallbacks, it was worse still).
With less than 4 channels, things got way worse with the AoS fallback
quickly even with 128bit vectors.
The strategy is pretty much the same as the existing one for formats
which fit into 32 bits, except there are now multiple vectors to be
fetched (2 or 4 to be exact), which need to be shuffled first (for 4
vectors this amounts to a transpose, for 2 it's a bit different).
Then the unpack is done the same way (with the exception that the shift
of the channels is now modulo 32, and we need to select the right
vector).
In fact the most complex part about it is to get the shuffles right
for separating into lo/hi parts for AVX/AVX2...
This also makes use of the new ability of gather to use provided type
information, which we abuse to outsmart llvm so we get decent shuffles,
and to fetch 3x32bit vectors without having to ZExt the scalar.
And just because we can, we handle double formats too, albeit they are
a bit different (draw sometimes needs to handle that).
v2: fix typo float/int bug (generating inefficient code).

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 529 +++++++++++++++-------
 1 file changed, 375 insertions(+), 154 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index b3bc155..34da869 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -31,6 +31,7 @@
 #include "util/u_format.h"
 #include "util/u_memory.h"
 #include "util/u_string.h"
+#include "util/u_math.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
@@ -113,6 +114,166 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
 }
 
 
+
+/**
+ * Extract one channel from a vector of packed pixels and convert it to
+ * the type of the build context.
+ *
+ * @param bld        build context, bld->type is the type of the result
+ * @param blockbits  number of valid bits per packed element; no MSB mask is
+ *                   applied to unsigned channels ending at or past this bit
+ * @param srgb_chan  TRUE to decode an unsigned channel as sRGB -> linear
+ * @param chan_desc  description (type, size, shift, ...) of the channel
+ * @param packed     vector with the packed pixels
+ * @return the decoded channel (undef for void channels)
+ */
+static LLVMValueRef
+lp_build_extract_soa_chan(struct lp_build_context *bld,
+                          unsigned blockbits,
+                          boolean srgb_chan,
+                          struct util_format_channel_description chan_desc,
+                          LLVMValueRef packed)
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_type type = bld->type;
+   LLVMValueRef input = packed;
+   const unsigned width = chan_desc.size;
+   const unsigned start = chan_desc.shift;
+   const unsigned stop = start + width;
+
+   /* Decode the input vector component */
+   switch(chan_desc.type) {
+   case UTIL_FORMAT_TYPE_VOID:
+      input = bld->undef;
+      break;
+
+   case UTIL_FORMAT_TYPE_UNSIGNED:
+      /* Align the LSB */
+      if (start) {
+         input = LLVMBuildLShr(builder, input,
+                               lp_build_const_int_vec(gallivm, type, start), "");
+      }
+      /* Zero the MSBs */
+      if (stop < blockbits) {
+         unsigned mask = ((unsigned long long)1 << width) - 1;
+         input = LLVMBuildAnd(builder, input,
+                              lp_build_const_int_vec(gallivm, type, mask), "");
+      }
+      /* Type conversion */
+      if (type.floating) {
+         if (srgb_chan) {
+            struct lp_type conv_type = lp_uint_type(type);
+            input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
+         }
+         else if (chan_desc.normalized) {
+            input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
+         }
+         else if (width < type.width) {
+            input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
+         }
+         else {
+            /* Full-width values can have the top bit set: convert unsigned */
+            input = LLVMBuildUIToFP(builder, input, bld->vec_type, "");
+         }
+      }
+      else if (chan_desc.pure_integer) {
+         /* Nothing to do */
+      } else {
+          /* FIXME */
+          assert(0);
+      }
+      break;
+
+   case UTIL_FORMAT_TYPE_SIGNED:
+      /* Align the sign bit first. */
+      if (stop < type.width) {
+         unsigned bits = type.width - stop;
+         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
+         input = LLVMBuildShl(builder, input, bits_val, "");
+      }
+      /* Align the LSB (with an arithmetic shift to preserve the sign) */
+      if (chan_desc.size < type.width) {
+         unsigned bits = type.width - chan_desc.size;
+         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
+         input = LLVMBuildAShr(builder, input, bits_val, "");
+      }
+      /* Type conversion */
+      if (type.floating) {
+         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
+         if (chan_desc.normalized) {
+            /* 64bit shift: 1 << 31 would overflow int for 32bit channels */
+            double scale = 1.0 / ((1ull << (chan_desc.size - 1)) - 1);
+            LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
+            input = LLVMBuildFMul(builder, input, scale_val, "");
+            /*
+             * The formula above will produce value below -1.0 for most negative
+             * value but everything seems happy with that hence disable for now.
+             */
+            if (0)
+               input = lp_build_max(bld, input,
+                                    lp_build_const_vec(gallivm, type, -1.0f));
+         }
+      }
+      else if (chan_desc.pure_integer) {
+         /* Nothing to do */
+      } else {
+          /* FIXME */
+          assert(0);
+      }
+      break;
+
+   case UTIL_FORMAT_TYPE_FLOAT:
+      if (type.floating) {
+         if (chan_desc.size == 16) {
+            struct lp_type f16i_type = type;
+            f16i_type.width /= 2;
+            f16i_type.floating = 0;
+            if (start) {
+               input = LLVMBuildLShr(builder, input,
+                                     lp_build_const_int_vec(gallivm, type, start), "");
+            }
+            input = LLVMBuildTrunc(builder, input,
+                                   lp_build_vec_type(gallivm, f16i_type), "");
+            input = lp_build_half_to_float(gallivm, input);
+         } else {
+            assert(start == 0);
+            assert(stop == 32);
+            assert(type.width == 32);
+         }
+         input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
+      }
+      else {
+         /* FIXME */
+         assert(0);
+         input = bld->undef;
+      }
+      break;
+
+   case UTIL_FORMAT_TYPE_FIXED:
+      if (type.floating) {
+         double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
+         LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
+         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
+         input = LLVMBuildFMul(builder, input, scale_val, "");
+      }
+      else {
+         /* FIXME */
+         assert(0);
+         input = bld->undef;
+      }
+      break;
+
+   default:
+      assert(0);
+      input = bld->undef;
+      break;
+   }
+
+   return input;
+}
+
+
 /**
  * Unpack several pixels in SoA.
  *
@@ -143,7 +304,6 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
                          LLVMValueRef packed,
                          LLVMValueRef rgba_out[4])
 {
-   LLVMBuilderRef builder = gallivm->builder;
    struct lp_build_context bld;
    LLVMValueRef inputs[4];
    unsigned chan;
@@ -159,162 +319,19 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
 
    /* Decode the input vector components */
    for (chan = 0; chan < format_desc->nr_channels; ++chan) {
-      const unsigned width = format_desc->channel[chan].size;
-      const unsigned start = format_desc->channel[chan].shift;
-      const unsigned stop = start + width;
-      LLVMValueRef input;
-
-      input = packed;
-
-      switch(format_desc->channel[chan].type) {
-      case UTIL_FORMAT_TYPE_VOID:
-         input = lp_build_undef(gallivm, type);
-         break;
-
-      case UTIL_FORMAT_TYPE_UNSIGNED:
-         /*
-          * Align the LSB
-          */
-
-         if (start) {
-            input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(gallivm, type, start), "");
-         }
-
-         /*
-          * Zero the MSBs
-          */
-
-         if (stop < format_desc->block.bits) {
-            unsigned mask = ((unsigned long long)1 << width) - 1;
-            input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(gallivm, type, mask), "");
-         }
-
-         /*
-          * Type conversion
-          */
-
-         if (type.floating) {
-            if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
-               if (format_desc->swizzle[3] == chan) {
-                  input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
-               }
-               else {
-                  struct lp_type conv_type = lp_uint_type(type);
-                  input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
-               }
-            }
-            else {
-               if(format_desc->channel[chan].normalized)
-                  input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
-               else
-                  input = LLVMBuildSIToFP(builder, input,
-                                          lp_build_vec_type(gallivm, type), "");
-            }
-         }
-         else if (format_desc->channel[chan].pure_integer) {
-            /* Nothing to do */
-         } else {
-             /* FIXME */
-             assert(0);
-         }
-
-         break;
-
-      case UTIL_FORMAT_TYPE_SIGNED:
-         /*
-          * Align the sign bit first.
-          */
-
-         if (stop < type.width) {
-            unsigned bits = type.width - stop;
-            LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
-            input = LLVMBuildShl(builder, input, bits_val, "");
-         }
+      struct util_format_channel_description chan_desc = format_desc->channel[chan];
+      boolean srgb_chan = FALSE;
 
-         /*
-          * Align the LSB (with an arithmetic shift to preserve the sign)
-          */
-
-         if (format_desc->channel[chan].size < type.width) {
-            unsigned bits = type.width - format_desc->channel[chan].size;
-            LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
-            input = LLVMBuildAShr(builder, input, bits_val, "");
-         }
-
-         /*
-          * Type conversion
-          */
-
-         if (type.floating) {
-            input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
-            if (format_desc->channel[chan].normalized) {
-               double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1);
-               LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
-               input = LLVMBuildFMul(builder, input, scale_val, "");
-               /* the formula above will produce value below -1.0 for most negative
-                * value but everything seems happy with that hence disable for now */
-               if (0)
-                  input = lp_build_max(&bld, input,
-                                       lp_build_const_vec(gallivm, type, -1.0f));
-            }
-         }
-         else if (format_desc->channel[chan].pure_integer) {
-            /* Nothing to do */
-         } else {
-             /* FIXME */
-             assert(0);
-         }
-
-         break;
-
-      case UTIL_FORMAT_TYPE_FLOAT:
-         if (type.floating) {
-            if (format_desc->channel[chan].size == 16) {
-               struct lp_type f16i_type = type;
-               f16i_type.width /= 2;
-               f16i_type.floating = 0;
-               if (start) {
-                  input = LLVMBuildLShr(builder, input,
-                             lp_build_const_int_vec(gallivm, type, start), "");
-               }
-               input = LLVMBuildTrunc(builder, input,
-                                      lp_build_vec_type(gallivm, f16i_type), "");
-               input = lp_build_half_to_float(gallivm, input);
-            } else {
-               assert(start == 0);
-               assert(stop == 32);
-               assert(type.width == 32);
-            }
-            input = LLVMBuildBitCast(builder, input, lp_build_vec_type(gallivm, type), "");
-         }
-         else {
-            /* FIXME */
-            assert(0);
-            input = lp_build_undef(gallivm, type);
-         }
-         break;
-
-      case UTIL_FORMAT_TYPE_FIXED:
-         if (type.floating) {
-            double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1);
-            LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
-            input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
-            input = LLVMBuildFMul(builder, input, scale_val, "");
-         }
-         else {
-            /* FIXME */
-            assert(0);
-            input = lp_build_undef(gallivm, type);
-         }
-         break;
-
-      default:
-         assert(0);
-         input = lp_build_undef(gallivm, type);
-         break;
+      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
+          format_desc->swizzle[3] != chan) {
+         srgb_chan = TRUE;
       }
 
-      inputs[chan] = input;
+      inputs[chan] = lp_build_extract_soa_chan(&bld,
+                                               format_desc->block.bits,
+                                               srgb_chan,
+                                               chan_desc,
+                                               packed);
    }
 
    lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
@@ -450,6 +467,210 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
       return;
    }
 
+
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
+       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
+       format_desc->block.width == 1 &&
+       format_desc->block.height == 1 &&
+       format_desc->block.bits > type.width &&
+       ((format_desc->block.bits <= type.width * type.length &&
+         format_desc->channel[0].size <= type.width) ||
+        (format_desc->channel[0].size == 64 &&
+         format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
+         type.floating)))
+   {
+      /*
+       * Similar to above, but the packed pixel is larger than what fits
+       * into an element of the destination format. The packed pixels will be
+       * shuffled into SoA vectors appropriately, and then the extraction will
+       * be done in parallel as much as possible.
+       * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
+       * the gathered vectors can be shuffled easily (even with avx).
+       * 64xn float -> 32xn float is handled too but it's a bit special as
+       * it does the conversion pre-shuffle.
+       */
+
+      LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
+      struct lp_type fetch_type, gather_type = type;
+      unsigned num_gather, fetch_width, i, j;
+      struct lp_build_context bld;
+      boolean fp64 = format_desc->channel[0].size == 64;
+
+      lp_build_context_init(&bld, gallivm, type);
+
+      assert(type.width == 32);
+      assert(format_desc->block.bits > type.width);
+
+      /*
+       * First, figure out fetch order.
+       */
+      fetch_width = util_next_power_of_two(format_desc->block.bits);
+      num_gather = fetch_width / type.width;
+      /*
+       * fp64 are treated like fp32 except we fetch twice wide values
+       * (as we shuffle after trunc). The shuffles for that work out
+       * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
+       * albeit we miss the potential opportunity for hw gather (as it
+       * only handles native size).
+       */
+      num_gather = fetch_width / type.width;
+      gather_type.width *= num_gather;
+      if (fp64) {
+         num_gather /= 2;
+      }
+      gather_type.length /= num_gather;
+
+      for (i = 0; i < num_gather; i++) {
+         LLVMValueRef offsetr, shuf_vec;
+         if(num_gather == 4) {
+            for (j = 0; j < gather_type.length; j++) {
+               unsigned idx = i + 4*j;
+               shuffles[j] = lp_build_const_int32(gallivm, idx);
+            }
+            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
+            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
+
+         }
+         else if (num_gather == 2) {
+            assert(num_gather == 2);
+            for (j = 0; j < gather_type.length; j++) {
+               unsigned idx = i*2 + (j%2) + (j/2)*4;
+               shuffles[j] = lp_build_const_int32(gallivm, idx);
+            }
+            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
+            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
+         }
+         else {
+            assert(num_gather == 1);
+            offsetr = offset;
+         }
+         if (gather_type.length == 1) {
+            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
+            offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
+         }
+
+         /*
+          * Determine whether to use float or int loads. This is mostly
+          * to outsmart the (stupid) llvm int/float shuffle logic, we
+          * don't really care much if the data is floats or ints...
+          * But llvm will refuse to use a single float shuffle with int data
+          * and use 3 int shuffles instead, so the code looks atrocious.
+          * (Note bitcasts often won't help, as llvm is too smart to be
+          * fooled by that.)
+          * Nobody cares about simd float<->int domain transition penalties,
+          * which usually don't even exist for shuffles anyway.
+          * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
+          * going into transpose, which is unpacks, so doesn't really matter
+          * much).
+          * With 2x32bit or 4x16bit fetch, we use float vec, since those
+          * go into the weird channel separation shuffle. With floats,
+          * this is (with 128bit vectors):
+          * - 2 movq, 2 movhpd, 2 shufps
+          * With ints it would be:
+          * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
+          * I've seen texture functions increase in code size by 15% just due
+          * to that (there's lots of such fetches in them...)
+          * (We could choose a different gather order to improve this somewhat
+          * for the int path, but it would basically just drop the blends,
+          * so the float path with this order really is optimal.)
+          * Albeit it is tricky sometimes llvm doesn't ignore the float->int
+          * casts so must avoid them until we're done with the float shuffle...
+          * 3x16bit formats (the same is also true for 3x8) are pretty bad but
+          * there's nothing we can do about them (we could overallocate by
+          * those couple bytes and use unaligned but pot sized load).
+          * Note that this is very much x86 specific. I don't know if this
+          * affects other archs at all.
+          */
+         if (num_gather > 1) {
+            /*
+             * We always want some float type here (with x86)
+             * due to shuffles being float ones afterwards (albeit for
+             * the num_gather == 4 case int should work fine too
+             * (unless there's some problems with avx but not avx2).
+             */
+            if (format_desc->channel[0].size == 64) {
+               fetch_type = lp_type_float_vec(64, gather_type.width);
+            } else {
+               fetch_type = lp_type_int_vec(32, gather_type.width);
+            }
+         }
+         else {
+            /* type doesn't matter much */
+            if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
+                (format_desc->channel[0].size == 32 ||
+                 format_desc->channel[0].size == 64)) {
+            fetch_type = lp_type_float(gather_type.width);
+            } else {
+               fetch_type = lp_type_uint(gather_type.width);
+            }
+         }
+
+         /* Now finally gather the values */
+         packed[i] = lp_build_gather(gallivm, gather_type.length,
+                                     format_desc->block.bits,
+                                     fetch_type, aligned,
+                                     base_ptr, offsetr, FALSE);
+         if (fp64) {
+            struct lp_type conv_type = type;
+            conv_type.width *= 2;
+            packed[i] = LLVMBuildBitCast(builder, packed[i],
+                                         lp_build_vec_type(gallivm, conv_type), "");
+            packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
+         }
+      }
+
+      /* shuffle the gathered values to SoA */
+      if (num_gather == 2) {
+         for (i = 0; i < num_gather; i++) {
+            for (j = 0; j < type.length; j++) {
+               unsigned idx = (j%2)*2 + (j/4)*4 + i;
+               if ((j/2)%2)
+                  idx += type.length;
+               shuffles[j] = lp_build_const_int32(gallivm, idx);
+            }
+            dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
+                                            LLVMConstVector(shuffles, type.length), "");
+         }
+      }
+      else if (num_gather == 4) {
+         lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
+      }
+      else {
+         assert(num_gather == 1);
+         dst[0] = packed[0];
+      }
+
+      /*
+       * And finally unpack exactly as above, except that
+       * chan shift is adjusted and the right vector selected.
+       */
+      if (!fp64) {
+         for (i = 0; i < num_gather; i++) {
+            dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
+         }
+         for (i = 0; i < format_desc->nr_channels; i++) {
+            struct util_format_channel_description chan_desc = format_desc->channel[i];
+            unsigned blockbits = type.width;
+            unsigned vec_nr = chan_desc.shift / type.width;
+            chan_desc.shift %= type.width;
+
+            output[i] = lp_build_extract_soa_chan(&bld,
+                                                  blockbits,
+                                                  FALSE,
+                                                  chan_desc,
+                                                  dst[vec_nr]);
+         }
+      }
+      else {
+         for (i = 0; i < format_desc->nr_channels; i++)  {
+            output[i] = dst[i];
+         }
+      }
+
+      lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
+      return;
+   }
+
    if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
        format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
       /*
-- 
2.7.4