[Mesa-dev] [PATCH 2/2] gallivm: don't calculate square root of rho if we use accurate rho method

sroland at vmware.com sroland at vmware.com
Wed Aug 28 07:33:41 PDT 2013


From: Roland Scheidegger <sroland at vmware.com>

While a sqrt here and there shouldn't hurt much (depending on the cpu) it is
possible to completely omit it since rho is only used for calculating lod and
there log2(x) == 0.5*log2(x^2). Depending on the exact path taken for
calculating lod this means we get a simple mul instead of sqrt (in case of
nearest mip filter in fact we don't need to replace the sqrt with something
else at all), only in some not very useful path this doesn't work (combined
brilinear calculation of int level and fractional lod, accurate rho calc but
brilinear filtering seems odd).
Apart from being faster as an added bonus this should increase our crappy
fractional accuracy of lod, since fast_log2 is only good for ~3bits and this
should increase accuracy by one bit (though not used if dimension is just one
as we'd need an extra mul there as we never had the squared rho in the first
place).
---
 src/gallium/auxiliary/gallivm/lp_bld_arit.c   |   20 +++++--
 src/gallium/auxiliary/gallivm/lp_bld_arit.h   |    3 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |   76 +++++++++++++------------
 3 files changed, 56 insertions(+), 43 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 09107ff..c295e22 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -3381,7 +3381,8 @@ lp_build_fast_log2(struct lp_build_context *bld,
  */
 LLVMValueRef
 lp_build_ilog2(struct lp_build_context *bld,
-               LLVMValueRef x)
+               LLVMValueRef x,
+               boolean x_is_squared)
 {
    LLVMBuilderRef builder = bld->gallivm->builder;
    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
@@ -3391,11 +3392,20 @@ lp_build_ilog2(struct lp_build_context *bld,
 
    assert(lp_check_value(bld->type, x));
 
-   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
-   x = LLVMBuildFMul(builder, x, sqrt2, "");
+   if (x_is_squared) {
+      struct lp_type i_type = lp_int_type(bld->type);
+      LLVMValueRef one = lp_build_const_int_vec(bld->gallivm, i_type, 1);
+      /* ipart = log2(x) + 0.5 = 0.5*(log2(x^2) + 1.0) */
+      ipart = lp_build_extract_exponent(bld, x, 1);
+      ipart = LLVMBuildAShr(builder, ipart, one, "");
+   }
 
-   /* ipart = floor(log2(x) + 0.5)  */
-   ipart = lp_build_extract_exponent(bld, x, 0);
+   else {
+      /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
+      x = LLVMBuildFMul(builder, x, sqrt2, "");
+      /* ipart = floor(log2(x) + 0.5)  */
+      ipart = lp_build_extract_exponent(bld, x, 0);
+   }
 
    return ipart;
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index d98025e..931175c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -323,7 +323,8 @@ lp_build_fast_log2(struct lp_build_context *bld,
 
 LLVMValueRef
 lp_build_ilog2(struct lp_build_context *bld,
-               LLVMValueRef x);
+               LLVMValueRef x,
+               boolean x_is_squared);
 
 void
 lp_build_exp2_approx(struct lp_build_context *bld,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index e1cfd78..c34833a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -232,6 +232,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
    unsigned length = coord_bld->type.length;
    unsigned num_quads = length / 4;
    boolean rho_per_quad = rho_bld->type.length != length;
+   boolean no_rho_opt = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1);
    unsigned i;
    LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
    LLVMValueRef rho_xvec, rho_yvec;
@@ -264,12 +265,13 @@ lp_build_rho(struct lp_build_sample_context *bld,
       else {
          rho = lp_build_swizzle_scalar_aos(coord_bld, cube_rho, 0, 4);
       }
-      if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
-         rho = lp_build_sqrt(rho_bld, rho);
-      }
       /* Could optimize this for single quad just skip the broadcast */
       cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                             rho_bld->type, float_size, index0);
+      if (no_rho_opt) {
+         /* skipping sqrt hence returning rho squared */
+         cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
+      }
       rho = lp_build_mul(rho_bld, cubesize, rho);
    }
    else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
@@ -281,7 +283,11 @@ lp_build_rho(struct lp_build_sample_context *bld,
          floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                                coord_bld->type, float_size, indexi);
 
-         if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
+         /*
+          * note that for rho_per_quad case could reduce math (at some shuffle
+          * cost), but for now use the same code as the per-pixel lod case.
+          */
+         if (no_rho_opt) {
             ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
             ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
             ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
@@ -295,7 +301,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
             ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
          }
       }
-      if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
+      if (no_rho_opt) {
          rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
          rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
          if (dims > 2) {
@@ -303,19 +309,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
             rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
          }
          rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
-
-         if (rho_per_quad) {
-            /*
-             * note for this case without per-pixel lod could reduce math more
-             * (at some shuffle cost), but for now only do sqrt after packing,
-             * otherwise would also need different code to per-pixel lod case.
-             */
-            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                            rho_bld->type, rho, 0);
-         }
-         rho = lp_build_sqrt(rho_bld, rho);
-
-      }
+         /* skipping sqrt hence returning rho squared */
+      }
       else {
          rho = ddmax[0];
          if (dims > 1) {
@@ -324,13 +319,13 @@ lp_build_rho(struct lp_build_sample_context *bld,
                rho = lp_build_max(coord_bld, rho, ddmax[2]);
             }
          }
-         if (rho_per_quad) {
-            /*
-             * rho_vec contains per-pixel rho, convert to scalar per quad.
-             */
-            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                            rho_bld->type, rho, 0);
-         }
+      }
+      if (rho_per_quad) {
+         /*
+          * rho_vec contains per-pixel rho, convert to scalar per quad.
+          */
+         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+                                         rho_bld->type, rho, 0);
       }
    }
    else {
@@ -362,7 +357,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
          }
       }
 
-      if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
+      if (no_rho_opt) {
          static const unsigned char swizzle01[] = { /* no-op swizzle */
             0, 1,
             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
@@ -407,16 +402,9 @@ lp_build_rho(struct lp_build_sample_context *bld,
                                             rho_bld->type, rho, 0);
          }
          else {
-            /*
-             * on some cpus with half-speed 8-wide sqrt (e.g. SNB but not IVB)
-             * doing pack/sqrt/unpack/swizzle might be better for 8-wide case,
-             * same is true for cpus having faster scalars than 4-wide vecs
-             * for 4-wide case (where pack/unpack would be no-ops anyway).
-             * (Same is true really for cube_rho case above.)
-             */
             rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
          }
-         rho = lp_build_sqrt(rho_bld, rho);
+         /* skipping sqrt hence returning rho squared */
       }
       else {
          ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
@@ -636,7 +624,7 @@ lp_build_brilinear_rho(struct lp_build_context *bld,
 
    /*
     * The pre factor will make the intersections with the exact powers of two
-    * happen precisely where we want then to be, which means that the integer
+    * happen precisely where we want them to be, which means that the integer
     * part will not need any post adjustments.
     */
    rho = lp_build_mul(bld, rho,
@@ -740,6 +728,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
       }
       else {
          LLVMValueRef rho;
+         boolean rho_squared = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
+                               (bld->dims > 1);
 
          rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);
 
@@ -763,13 +753,20 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
                 * Don't actually need both all the time, ipart is needed
                 * for nearest mipfilter, pos_or_zero if min != mag.
                 */
-               *out_lod_ipart = lp_build_ilog2(lodf_bld, rho);
+               *out_lod_ipart = lp_build_ilog2(lodf_bld, rho, rho_squared);
                *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                                 rho, lodf_bld->one);
                return;
             }
             if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
-                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
+                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR) &&
+                !rho_squared) {
+               /*
+                * This can't work if rho is squared. Not sure if it could be
+                * fixed while keeping it worthwhile, could also do sqrt here
+                * but brilinear and no_rho_opt seems like a combination not
+                * making much sense anyway so just use ordinary path below.
+                */
                lp_build_brilinear_rho(lodf_bld, rho, BRILINEAR_FACTOR,
                                       out_lod_ipart, out_lod_fpart);
                *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
@@ -784,6 +781,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
          else {
             lod = lp_build_fast_log2(lodf_bld, rho);
          }
+         if (rho_squared) {
+            /* log2(x) == 0.5*log2(x^2) */
+            lod = lp_build_mul(lodf_bld, lod,
+                               lp_build_const_vec(bld->gallivm, lodf_bld->type, 0.5F));
+         }
 
          /* add shader lod bias */
          if (lod_bias) {
-- 
1.7.9.5


More information about the mesa-dev mailing list