[Mesa-dev] [PATCH 2/2] gallivm: change cubemaps / derivatives handling, take 55

Tue Apr 16 12:07:07 PDT 2013

From: Roland Scheidegger <sroland at vmware.com>

It turns out the previous "fix" for handling per-pixel face selection and
derivatives didn't work out that well - the derivatives were wrong by
quite a bit. In theory, transforming the derivatives into cube space
should work, but it would be _a lot_ more work than the "simplified"
transform that was used.
So, for explicit derivatives, I'm just giving up and going back to not
honoring them.
For implicit derivatives (and the fake explicit ones), however, we try
something a little different: we just calculate rho as we would for a 3d
texture, that is, after scaling the coords by the inverse major axis.
This gives the same results as calculating the derivs after projection of
the coords to the same face as long as all pixels hit the same face (and
only without no_rho_opt, otherwise it should be a bit worse). And when
not all pixels hit the same face, the results aren't so hot but
not catastrophically bad (I believe not off by more than a factor of 2 without
no_rho_opt and not more than sqrt(2) with no_rho_opt). I think this is better
than just picking the wrong face, but who knows...
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c     |  212 +++++++++++----------
 src/gallium/auxiliary/gallivm/lp_bld_sample.h     |    3 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |    8 +-
 3 files changed, 119 insertions(+), 104 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index ae1e003..154bcad 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -245,16 +245,17 @@ lp_build_rho(struct lp_build_sample_context *bld,
       LLVMValueRef cubesize;
       LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
       /*
-       * If we have derivs too then we have per-pixel cube_rho - doesn't matter
-       * though until we do per-pixel lod.
        * Cube map code did already everything except size mul and per-quad extraction.
        */
+      rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+                                      perquadf_bld->type, cube_rho, 0);
+      if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_OPT) {
+         rho = lp_build_sqrt(perquadf_bld, rho);
+      }
       /* Could optimize this for single quad just skip the broadcast */
       cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
-                                            coord_bld->type, float_size, index0);
-      rho_vec = lp_build_mul(coord_bld, cubesize, cube_rho);
-      rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                      perquadf_bld->type, rho_vec, 0);
+                                            perquadf_bld->type, float_size, index0);
+      rho = lp_build_mul(perquadf_bld, cubesize, rho);
    }
    else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
       LLVMValueRef ddmax[3], ddx[3], ddy[3];
@@ -1356,25 +1357,31 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
                      LLVMValueRef *face,
                      LLVMValueRef *face_s,
                      LLVMValueRef *face_t,
-                     LLVMValueRef *rho)
+                     LLVMValueRef *rho,
+                     boolean need_derivs)
 {
    struct lp_build_context *coord_bld = &bld->coord_bld;
    LLVMBuilderRef builder = bld->gallivm->builder;
    struct gallivm_state *gallivm = bld->gallivm;
    LLVMValueRef si, ti, ri;
-   boolean need_derivs = TRUE;
 
    if (1 || coord_bld->type.length > 4) {
       /*
        * Do per-pixel face selection. We cannot however (as we used to do)
        * simply calculate the derivs afterwards (which is very bogus for
-       * explicit derivs anyway) because the values would be "random" when
-       * not all pixels lie on the same face. Hence just transform the derivs
-       * (or rather only the dmax values), which works both for implicit and
-       * explicit derivatives and doesn't add much math (except need to
-       * calculate derivs for 3 instead of 2 coords and have a couple more selects
-       * but cuts some minor math elsewhere). The derivs don't need mirroring,
-       * just selection, since noone cares about the sign.
+       * explicit derivs btw) because the values would be "random" when
+       * not all pixels lie on the same face. So what we do here is just
+       * calculate the derivatives after scaling the coords by the absolute
+       * value of the inverse major axis, and essentially do rho calculation
+       * steps as if it were a 3d texture. This is perfect if all pixels hit
+       * the same face, but not so great at edges, I believe the max error
+       * should be sqrt(2) with no_rho_opt or 2 otherwise (essentially measuring
+       * the 3d distance between 2 points on the cube instead of measuring up/down
+       * the edge). Still this is possibly a win over just selecting the same face
+       * for all pixels. Unfortunately, something like that doesn't work for
+       * explicit derivatives.
+       * TODO: handle explicit derivatives by transforming them alongside coords
+       * somehow.
        */
       struct lp_build_context *cint_bld = &bld->int_coord_bld;
       struct lp_type intctype = cint_bld->type;
@@ -1392,95 +1399,111 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
       LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
       LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
       LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
-      LLVMValueRef dmax[3], dmaxsnew, dmaxtnew;
 
       assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
       assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
       assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
 
       /*
-       * TODO do this only when needed.
+       * get absolute value (for x/y/z face selection) and sign bit
+       * (for mirroring minor coords and pos/neg face selection)
+       * of the original coords.
        */
-      if (need_derivs && !derivs) {
-         LLVMValueRef ddx_ddy[2], tmp[2];
-         /*
-          * This isn't quite the same as the "ordinary" path since
-          * we need to extract the ds/dt/dr values before further processing.
-          */
-         static const unsigned char swizzle11[] = { /* no-op swizzle */
+      as = lp_build_abs(&bld->coord_bld, s);
+      at = lp_build_abs(&bld->coord_bld, t);
+      ar = lp_build_abs(&bld->coord_bld, r);
+
+      /*
+       * major face determination: select x if x > y else select y
+       * select z if z >= max(x,y) else select previous result
+       * if some axis are the same we chose z over y, y over x - the
+       * dx10 spec seems to ask for it while OpenGL doesn't care (if we
+       * wouldn't care could save a select or two if using different
+       * compares and doing at_g_as_ar last since tnewx and tnewz are the
+       * same).
+       */
+      as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
+      maxasat = lp_build_max(coord_bld, as, at);
+      ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
+
+      if (need_derivs) {
+         LLVMValueRef ddx_ddy[2], tmp[2], rho_vec;
+         static const unsigned char swizzle0[] = { /* no-op swizzle */
             0, LP_BLD_SWIZZLE_DONTCARE,
             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
          };
-         static const unsigned char swizzle12[] = {
-            2, LP_BLD_SWIZZLE_DONTCARE,
+         static const unsigned char swizzle1[] = {
+            1, LP_BLD_SWIZZLE_DONTCARE,
             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
          };
-         static const unsigned char swizzle21[] = { /* no-op swizzle */
-            0, LP_BLD_SWIZZLE_DONTCARE,
-            2, LP_BLD_SWIZZLE_DONTCARE
+         static const unsigned char swizzle01[] = { /* no-op swizzle */
+            0, 1,
+            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
          };
-         static const unsigned char swizzle22[] = {
-            1, LP_BLD_SWIZZLE_DONTCARE,
-            3, LP_BLD_SWIZZLE_DONTCARE
+         static const unsigned char swizzle23[] = {
+            2, 3,
+            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+         };
+         static const unsigned char swizzle02[] = {
+            0, 2,
+            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
          };
 
+         /*
+          * scale the s/t/r coords pre-select/mirror so we can calculate
+          * "reasonable" derivs.
+          */
+         ma = lp_build_select(coord_bld, as_ge_at, s, t);
+         ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
+         ima = lp_build_cube_imapos(coord_bld, ma);
+         s = lp_build_mul(coord_bld, s, ima);
+         t = lp_build_mul(coord_bld, t, ima);
+         r = lp_build_mul(coord_bld, r, ima);
+
+         /*
+          * This isn't quite the same as the "ordinary" (3d deriv) path since we
+          * know the texture is square which simplifies things (we can omit the
+          * size mul which happens very early completely here and do it at the
+          * very end).
+          */
          ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
          ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
-         ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
-         ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
 
-         tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle21);
-         tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle22);
-         dmax[0] = lp_build_max(coord_bld, tmp[0], tmp[1]);
-         dmax[1] = lp_build_swizzle_aos(coord_bld, dmax[0], swizzle12);
+         if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_OPT) {
+            ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
+            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
+         }
+         else {
+            ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
+            ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
+         }
 
-         tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle11);
-         tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle12);
-         dmax[2] = lp_build_max(coord_bld, tmp[0], tmp[1]);
-      }
-      else if (need_derivs) {
-         LLVMValueRef abs_ddx[3], abs_ddy[3];
-         abs_ddx[0] = lp_build_abs(coord_bld, derivs->ddx[0]);
-         abs_ddx[1] = lp_build_abs(coord_bld, derivs->ddx[1]);
-         abs_ddx[2] = lp_build_abs(coord_bld, derivs->ddx[2]);
-         abs_ddy[0] = lp_build_abs(coord_bld, derivs->ddy[0]);
-         abs_ddy[1] = lp_build_abs(coord_bld, derivs->ddy[1]);
-         abs_ddy[2] = lp_build_abs(coord_bld, derivs->ddy[2]);
-         dmax[0] = lp_build_max(coord_bld, abs_ddx[0], abs_ddy[0]);
-         dmax[1] = lp_build_max(coord_bld, abs_ddx[1], abs_ddy[1]);
-         dmax[2] = lp_build_max(coord_bld, abs_ddx[2], abs_ddy[2]);
+         tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
+         tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
+         tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
+
+         if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_OPT) {
+            rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
+            rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
+         }
+         else {
+            rho_vec = lp_build_max(coord_bld, tmp[0], tmp[1]);
+            rho_vec = lp_build_max(coord_bld, rho_vec, tmp[2]);
+         }
+
+         tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
+         tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
+         *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
       }
 
       si = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
       ti = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
       ri = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
-
-      /*
-       * get absolute value (for x/y/z face selection) and sign bit
-       * (for mirroring minor coords and pos/neg face selection)
-       * of the original coords.
-       */
-      as = lp_build_abs(&bld->coord_bld, s);
-      at = lp_build_abs(&bld->coord_bld, t);
-      ar = lp_build_abs(&bld->coord_bld, r);
       signs = LLVMBuildAnd(builder, si, signmask, "");
       signt = LLVMBuildAnd(builder, ti, signmask, "");
       signr = LLVMBuildAnd(builder, ri, signmask, "");
 
       /*
-       * major face determination: select x if x > y else select y
-       * select z if z >= max(x,y) else select previous result
-       * if some axis are the same we chose z over y, y over x - the
-       * dx10 spec seems to ask for it while OpenGL doesn't care (if we
-       * wouldn't care could save a select or two if using different
-       * compares and doing at_g_as_ar last since tnewx and tnewz are the
-       * same).
-       */
-      as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
-      maxasat = lp_build_max(coord_bld, as, at);
-      ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
-
-      /*
        * compute all possible new s/t coords
        * snewx = signs * -r;
        * tnewx = -t;
@@ -1510,23 +1533,19 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
        */
 
       /* select/mirror */
+      if (!need_derivs) {
+         ma = lp_build_select(coord_bld, as_ge_at, s, t);
+      }
       *face_s = lp_build_select(cint_bld, as_ge_at, snewx, snewy);
       *face_t = lp_build_select(cint_bld, as_ge_at, tnewx, tnewy);
-      ma = lp_build_select(coord_bld, as_ge_at, s, t);
       *face = lp_build_select(cint_bld, as_ge_at, facex, facey);
-      if (need_derivs) {
-         dmaxsnew = lp_build_select(coord_bld, as_ge_at, dmax[2], dmax[0]);
-         dmaxtnew = lp_build_select(coord_bld, as_ge_at, dmax[1], dmax[2]);
-      }
 
+      if (!need_derivs) {
+         ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
+      }
       *face_s = lp_build_select(cint_bld, ar_ge_as_at, snewz, *face_s);
       *face_t = lp_build_select(cint_bld, ar_ge_as_at, tnewz, *face_t);
-      ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
       *face = lp_build_select(cint_bld, ar_ge_as_at, facez, *face);
-      if (need_derivs) {
-         dmaxsnew = lp_build_select(coord_bld, ar_ge_as_at, dmax[0], dmaxsnew);
-         dmaxtnew = lp_build_select(coord_bld, ar_ge_as_at, dmax[1], dmaxtnew);
-      }
 
       *face_s = LLVMBuildBitCast(builder, *face_s,
                                lp_build_vec_type(gallivm, coord_bld->type), "");
@@ -1542,26 +1561,15 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
       signma = LLVMBuildLShr(builder, mai, signshift, "");
       *face = LLVMBuildOr(builder, *face, signma, "face");
 
-      ima = lp_build_cube_imapos(coord_bld, ma);
-
       /* project coords */
-      *face_s = lp_build_mul(coord_bld, *face_s, ima);
+      if (!need_derivs) {
+         ima = lp_build_cube_imapos(coord_bld, ma);
+         *face_s = lp_build_mul(coord_bld, *face_s, ima);
+         *face_t = lp_build_mul(coord_bld, *face_t, ima);
+      }
+
       *face_s = lp_build_add(coord_bld, *face_s, posHalf);
-      *face_t = lp_build_mul(coord_bld, *face_t, ima);
       *face_t = lp_build_add(coord_bld, *face_t, posHalf);
-
-      /* project derivs */
-      if (need_derivs) {
-         /*
-          * we do some optimization here, since we know it's square
-          * we can do the max before projection (and before size mul,
-          * which the so-called "rho" is missing here).
-          * For explicit derivs this is fully per-pixel vector, for implicit
-          * derivs only the first value per quad contains useful values.
-          */
-         *rho = lp_build_max(coord_bld, dmaxsnew, dmaxtnew);
-         *rho = lp_build_mul(coord_bld, *rho, ima);
-      }
    }
 
    else {
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 72af813..cde8ce9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -437,7 +437,8 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
                      LLVMValueRef *face,
                      LLVMValueRef *face_s,
                      LLVMValueRef *face_t,
-                     LLVMValueRef *rho);
+                     LLVMValueRef *rho,
+                     boolean need_derivs);
 
 
 void
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index d2cc0f3..ced2103 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -1102,7 +1102,13 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
     */
    if (target == PIPE_TEXTURE_CUBE) {
       LLVMValueRef face, face_s, face_t;
-      lp_build_cube_lookup(bld, *s, *t, *r, derivs, &face, &face_s, &face_t, &cube_rho);
+      boolean need_derivs;
+      need_derivs = ((min_filter != mag_filter ||
+                      mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
+                      !bld->static_sampler_state->min_max_lod_equal &&
+                      !explicit_lod);
+      lp_build_cube_lookup(bld, *s, *t, *r, derivs, &face, &face_s, &face_t,
+                           &cube_rho, need_derivs);
       *s = face_s; /* vec */
       *t = face_t; /* vec */
       /* use 'r' to indicate cube face */
-- 
1.7.9.5