From 81cfcdbd87940914fc3c59acd0e43c4f6efb3bb7 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Thu, 29 Aug 2013 03:58:18 +0200
Subject: [PATCH] gallivm: don't calculate square root of rho if we use
 accurate rho method

While a sqrt here and there shouldn't hurt much (depending on the cpu) it is
possible to completely omit it since rho is only used for calculating lod and
there log2(x) == 0.5*log2(x^2). Depending on the exact path taken for
calculating lod this means we get a simple mul instead of sqrt (in case of
nearest mip filter in fact we don't need to replace the sqrt with something
else at all), only in some not very useful path this doesn't work (combined
brilinear calculation of int level and fractional lod, accurate rho calc but
brilinear filtering seems odd).
Apart from being faster as an added bonus this should increase our crappy
fractional accuracy of lod, since fast_log2 is only good for ~3bits and this
should increase accuracy by one bit (though not used if dimension is just one
as we'd need an extra mul there as we never had the squared rho in the first
place).

v2: use separate ilog2_sqrt function if we have squared rho.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c | 113 +++++++++++++++++---------
 1 file changed, 74 insertions(+), 39 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index e1cfd78e..9b0a92c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -232,6 +232,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
    unsigned length = coord_bld->type.length;
    unsigned num_quads = length / 4;
    boolean rho_per_quad = rho_bld->type.length != length;
+   boolean no_rho_opt = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1);
    unsigned i;
    LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
    LLVMValueRef rho_xvec, rho_yvec;
@@ -264,12 +265,13 @@ lp_build_rho(struct lp_build_sample_context *bld,
       else {
          rho = lp_build_swizzle_scalar_aos(coord_bld, cube_rho, 0, 4);
       }
-      if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
-         rho = lp_build_sqrt(rho_bld, rho);
-      }
       /* Could optimize this for single quad just skip the broadcast */
       cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                             rho_bld->type, float_size, index0);
+      if (no_rho_opt) {
+         /* skipping sqrt hence returning rho squared */
+         cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
+      }
       rho = lp_build_mul(rho_bld, cubesize, rho);
    }
    else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
@@ -281,7 +283,11 @@ lp_build_rho(struct lp_build_sample_context *bld,
          floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                                coord_bld->type, float_size, indexi);
 
-         if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
+         /*
+          * note that for rho_per_quad case could reduce math (at some shuffle
+          * cost), but for now use same code to per-pixel lod case.
+          */
+         if (no_rho_opt) {
             ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
             ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
             ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
@@ -295,7 +301,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
             ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
          }
       }
-      if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
+      if (no_rho_opt) {
          rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
          rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
          if (dims > 2) {
@@ -303,19 +309,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
             rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
          }
          rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
-
-         if (rho_per_quad) {
-            /*
-             * note for this case without per-pixel lod could reduce math more
-             * (at some shuffle cost), but for now only do sqrt after packing,
-             * otherwise would also need different code to per-pixel lod case.
-             */
-            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                            rho_bld->type, rho, 0);
-         }
-         rho = lp_build_sqrt(rho_bld, rho);
-
-      }
+         /* skipping sqrt hence returning rho squared */
+     }
       else {
          rho = ddmax[0];
          if (dims > 1) {
@@ -324,13 +319,13 @@ lp_build_rho(struct lp_build_sample_context *bld,
                rho = lp_build_max(coord_bld, rho, ddmax[2]);
             }
          }
-         if (rho_per_quad) {
-            /*
-             * rho_vec contains per-pixel rho, convert to scalar per quad.
-             */
-            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                            rho_bld->type, rho, 0);
-         }
+      }
+      if (rho_per_quad) {
+         /*
+          * rho_vec contains per-pixel rho, convert to scalar per quad.
+          */
+         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+                                         rho_bld->type, rho, 0);
       }
    }
    else {
@@ -362,7 +357,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
          }
       }
 
-      if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
+      if (no_rho_opt) {
          static const unsigned char swizzle01[] = { /* no-op swizzle */
             0, 1,
             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
@@ -407,16 +402,9 @@ lp_build_rho(struct lp_build_sample_context *bld,
                                             rho_bld->type, rho, 0);
          }
          else {
-            /*
-             * on some cpus with half-speed 8-wide sqrt (e.g. SNB but not IVB)
-             * doing pack/sqrt/unpack/swizzle might be better for 8-wide case,
-             * same is true for cpus having faster scalars than 4-wide vecs
-             * for 4-wide case (where pack/unpack would be no-ops anyway).
-             * (Same is true really for cube_rho case above.)
-             */
             rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
          }
-         rho = lp_build_sqrt(rho_bld, rho);
+         /* skipping sqrt hence returning rho squared */
       }
       else {
          ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
@@ -636,7 +624,7 @@ lp_build_brilinear_rho(struct lp_build_context *bld,
 
    /*
     * The pre factor will make the intersections with the exact powers of two
-    * happen precisely where we want then to be, which means that the integer
+    * happen precisely where we want them to be, which means that the integer
     * part will not need any post adjustments.
     */
    rho = lp_build_mul(bld, rho,
@@ -666,6 +654,34 @@ lp_build_brilinear_rho(struct lp_build_context *bld,
 
 
 /**
+ * Fast implementation of iround(log2(sqrt(x))), based on
+ * log2(x^n) == n*log2(x).
+ *
+ * Gives accurate results all the time.
+ * (Could be trivially extended to handle other power-of-two roots.)
+ */
+static LLVMValueRef
+lp_build_ilog2_sqrt(struct lp_build_context *bld,
+                    LLVMValueRef x)
+{
+   LLVMBuilderRef builder = bld->gallivm->builder;
+   LLVMValueRef ipart;
+   struct lp_type i_type = lp_int_type(bld->type);
+   LLVMValueRef one = lp_build_const_int_vec(bld->gallivm, i_type, 1);
+
+   assert(bld->type.floating);
+
+   assert(lp_check_value(bld->type, x));
+
+   /* ipart = log2(x) + 0.5 = 0.5*(log2(x^2) + 1.0) */
+   ipart = lp_build_extract_exponent(bld, x, 1);
+   ipart = LLVMBuildAShr(builder, ipart, one, "");
+
+   return ipart;
+}
+
+
+/**
  * Generate code to compute texture level of detail (lambda).
  * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
  * \param lod_bias  optional float vector with the shader lod bias
@@ -740,6 +756,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
       }
       else {
          LLVMValueRef rho;
+         boolean rho_squared = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
+                               (bld->dims > 1);
 
          rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);
 
@@ -760,16 +778,28 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
             if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
                 mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
                /*
-                * Don't actually need both all the time, ipart is needed
-                * for nearest mipfilter, pos_or_zero if min != mag.
+                * Don't actually need both values all the time, lod_ipart is
+                * needed for nearest mipfilter, lod_positive if min != mag.
                 */
-               *out_lod_ipart = lp_build_ilog2(lodf_bld, rho);
+               if (rho_squared) {
+                  *out_lod_ipart = lp_build_ilog2_sqrt(lodf_bld, rho);
+               }
+               else {
+                  *out_lod_ipart = lp_build_ilog2(lodf_bld, rho);
+               }
                *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                                 rho, lodf_bld->one);
                return;
             }
             if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
-                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
+                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR) &&
+                !rho_squared) {
+               /*
+                * This can't work if rho is squared. Not sure if it could be
+                * fixed while keeping it worthwile, could also do sqrt here
+                * but brilinear and no_rho_opt seems like a combination not
+                * making much sense anyway so just use ordinary path below.
+                */
                lp_build_brilinear_rho(lodf_bld, rho, BRILINEAR_FACTOR,
                                       out_lod_ipart, out_lod_fpart);
                *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
@@ -784,6 +814,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
          else {
             lod = lp_build_fast_log2(lodf_bld, rho);
          }
+         if (rho_squared) {
+            /* log2(x^2) == 0.5*log2(x) */
+            lod = lp_build_mul(lodf_bld, lod,
+                               lp_build_const_vec(bld->gallivm, lodf_bld->type, 0.5F));
+         }
 
          /* add shader lod bias */
          if (lod_bias) {
-- 
2.7.4