From 5171ec9ca92ce489e32c227ae3b4b6df621bbf40 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Fri, 29 Jan 2016 02:49:22 +0100
Subject: [PATCH] gallivm: add PK2H/UP2H support

Add support for these opcodes, the conversion functions were already
there albeit need some new packing stuff.
Just like the tgsi version, piglit won't like it for all the same
reasons, so it's disabled (UP2H passes piglit arb_shader_language_packing
tests, albeit since PK2H won't due those rounding differences I don't
know if that one works or not as the piglit test is rather difficult to
deal with).
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c        | 16 ++++-
 src/gallium/auxiliary/gallivm/lp_bld_pack.c        | 26 ++++++++
 src/gallium/auxiliary/gallivm/lp_bld_pack.h        |  5 ++
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.c        |  1 -
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 73 ++++++++++++++++++++++
 5 files changed, 119 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 7854142..7cf0dee 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -130,6 +130,7 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
  *
  * Convert float32 to half floats, preserving Infs and NaNs,
  * with rounding towards zero (trunc).
+ * XXX: For GL, would prefer rounding towards nearest(-even).
  */
 LLVMValueRef
 lp_build_float_to_half(struct gallivm_state *gallivm,
@@ -143,6 +144,15 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
    struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
    LLVMValueRef result;
 
+   /*
+    * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
+    * directly, without any (x86 or generic) intrinsics.
+    * Albeit the rounding mode cannot be specified (and is undefined,
+    * though in practice on x86 seems to do nearest-even but it may
+    * be dependent on instruction set support), so is essentially
+    * useless.
+    */
+
    if (util_cpu_caps.has_f16c &&
        (length == 4 || length == 8)) {
       struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
@@ -187,7 +197,11 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
         LLVMValueRef index = LLVMConstInt(i32t, i, 0);
         LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
 #if 0
-        /* XXX: not really supported by backends */
+        /*
+         * XXX: not really supported by backends.
+         * Even if they would now, rounding mode cannot be specified and
+         * is undefined.
+         */
         LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
 #else
         LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index 0b0f7f0..daa2043 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -257,6 +257,32 @@ lp_build_concat_n(struct gallivm_state *gallivm,
 
 
 /**
+ * Un-interleave vector.
+ * This will return a vector consisting of every second element
+ * (depending on lo_hi, beginning at 0 or 1).
+ * The returned vector size (elems and width) will only be half
+ * that of the source vector.
+ */
+LLVMValueRef
+lp_build_uninterleave1(struct gallivm_state *gallivm,
+                       unsigned num_elems,
+                       LLVMValueRef a,
+                       unsigned lo_hi)
+{
+   LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+   assert(num_elems <= LP_MAX_VECTOR_LENGTH);
+
+   for(i = 0; i < num_elems / 2; ++i)
+      elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
+
+   shuffle = LLVMConstVector(elems, num_elems / 2);
+
+   return LLVMBuildShuffleVector(gallivm->builder, a, a, shuffle, "");
+}
+
+
+/**
  * Interleave vector elements.
  *
  * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index 7cede35..367fba1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -58,6 +58,11 @@ lp_build_interleave2(struct gallivm_state *gallivm,
                      LLVMValueRef b,
                      unsigned lo_hi);
 
+LLVMValueRef
+lp_build_uninterleave1(struct gallivm_state *gallivm,
+                       unsigned num_elems,
+                       LLVMValueRef a,
+                       unsigned lo_hi);
 
 void
 lp_build_unpack2(struct gallivm_state *gallivm,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
index c88dfbf..1cbe47c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -248,7 +248,6 @@ lp_build_tgsi_inst_llvm(
    /* Ignore deprecated instructions */
    switch (inst->Instruction.Opcode) {
 
-   case TGSI_OPCODE_UP2H:
    case TGSI_OPCODE_UP2US:
    case TGSI_OPCODE_UP4B:
    case TGSI_OPCODE_UP4UB:
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index 6f75bec..f6b42ee 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -45,8 +45,10 @@
 #include "lp_bld_arit.h"
 #include "lp_bld_bitarit.h"
 #include "lp_bld_const.h"
+#include "lp_bld_conv.h"
 #include "lp_bld_gather.h"
 #include "lp_bld_logic.h"
+#include "lp_bld_pack.h"
 
 #include "tgsi/tgsi_exec.h"
 
@@ -530,6 +532,75 @@ static struct lp_build_tgsi_action log_action = {
    log_emit	 /* emit */
 };
 
+/* TGSI_OPCODE_PK2H */
+
+static void
+pk2h_fetch_args(
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   /* src0.x */
+   emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
+                                            0, TGSI_CHAN_X);
+   /* src0.y */
+   emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
+                                            0, TGSI_CHAN_Y);
+}
+
+static void
+pk2h_emit(const struct lp_build_tgsi_action *action,
+          struct lp_build_tgsi_context *bld_base,
+          struct lp_build_emit_data *emit_data)
+{
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   struct lp_type f16i_t;
+   LLVMValueRef lo, hi, res;
+
+   f16i_t = lp_type_uint_vec(16, bld_base->base.type.length * 32);
+   lo = lp_build_float_to_half(gallivm, emit_data->args[0]);
+   hi = lp_build_float_to_half(gallivm, emit_data->args[1]);
+   /* maybe some interleave doubling vector width would be useful... */
+   lo = lp_build_pad_vector(gallivm, lo, bld_base->base.type.length * 2);
+   hi = lp_build_pad_vector(gallivm, hi, bld_base->base.type.length * 2);
+   res = lp_build_interleave2(gallivm, f16i_t, lo, hi, 0);
+
+   emit_data->output[emit_data->chan] = res;
+}
+
+static struct lp_build_tgsi_action pk2h_action = {
+   pk2h_fetch_args, /* fetch_args */
+   pk2h_emit        /* emit */
+};
+
+/* TGSI_OPCODE_UP2H */
+
+static void
+up2h_emit(const struct lp_build_tgsi_action *action,
+          struct lp_build_tgsi_context *bld_base,
+          struct lp_build_emit_data *emit_data)
+{
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMContextRef context = gallivm->context;
+   LLVMValueRef lo, hi, res[2], arg;
+   unsigned nr = bld_base->base.type.length;
+   LLVMTypeRef i16t = LLVMVectorType(LLVMInt16TypeInContext(context), nr * 2);
+
+   arg = LLVMBuildBitCast(builder, emit_data->args[0], i16t, "");
+   lo = lp_build_uninterleave1(gallivm, nr * 2, arg, 0);
+   hi = lp_build_uninterleave1(gallivm, nr * 2, arg, 1);
+   res[0] = lp_build_half_to_float(gallivm, lo);
+   res[1] = lp_build_half_to_float(gallivm, hi);
+
+   emit_data->output[0] = emit_data->output[2] = res[0];
+   emit_data->output[1] = emit_data->output[3] = res[1];
+}
+
+static struct lp_build_tgsi_action up2h_action = {
+   scalar_unary_fetch_args, /* fetch_args */
+   up2h_emit                /* emit */
+};
+
 /* TGSI_OPCODE_LRP */
 
 static void
@@ -1032,10 +1103,12 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
    bld_base->op_actions[TGSI_OPCODE_EXP] = exp_action;
    bld_base->op_actions[TGSI_OPCODE_LIT] = lit_action;
    bld_base->op_actions[TGSI_OPCODE_LOG] = log_action;
+   bld_base->op_actions[TGSI_OPCODE_PK2H] = pk2h_action;
    bld_base->op_actions[TGSI_OPCODE_RSQ] = rsq_action;
    bld_base->op_actions[TGSI_OPCODE_SQRT] = sqrt_action;
    bld_base->op_actions[TGSI_OPCODE_POW] = pow_action;
    bld_base->op_actions[TGSI_OPCODE_SCS] = scs_action;
+   bld_base->op_actions[TGSI_OPCODE_UP2H] = up2h_action;
    bld_base->op_actions[TGSI_OPCODE_XPD] = xpd_action;
 
    bld_base->op_actions[TGSI_OPCODE_BREAKC].fetch_args = scalar_unary_fetch_args;
-- 
2.7.4