[Mesa-dev] [PATCH 2/3] gallivm: implement TG4 for ARB_texture_gather

Sun Mar 29 17:12:31 PDT 2015

From: Roland Scheidegger <sroland at vmware.com>

This is quite trivial: essentially just follow the same code path you'd use
with a linear min/mag (and no mip) filter, then skip the filtering after
looking up the texels in favor of directly assigning the right channel to the
result. (This is not true, though, for the multi-offset version, should we
want to support it - that would probably need something along the lines of
4x nearest sampling, since coord wrapping has to be done individually per
texel.)
Supports multi-channel formats.
Of the features covered by the SM5 gather cap bit, this should support
non-constant offsets and shadow comparisons (the former untested), but not
component selection (which should be easy to implement, but none of this is
really exposable for now anyway).
---
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 137 +++++++++++++++++-----
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c   |  36 ++++--
 2 files changed, 133 insertions(+), 40 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 962f478..ff508e2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -840,6 +840,7 @@ lp_build_masklerp2d(struct lp_build_context *bld,
  */
 static void
 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
+                             boolean is_gather,
                              LLVMValueRef size,
                              LLVMValueRef linear_mask,
                              LLVMValueRef row_stride_vec,
@@ -853,6 +854,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    LLVMBuilderRef builder = bld->gallivm->builder;
    struct lp_build_context *ivec_bld = &bld->int_coord_bld;
    struct lp_build_context *coord_bld = &bld->coord_bld;
+   struct lp_build_context *texel_bld = &bld->texel_bld;
    const unsigned dims = bld->dims;
    LLVMValueRef width_vec;
    LLVMValueRef height_vec;
@@ -875,7 +877,16 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
                            bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
                           bld->static_sampler_state->seamless_cube_map;
-   accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter;
+   /*
+    * XXX I don't know how this is really supposed to work with gather. From GL
+    * spec wording (not gather specific) it sounds like the 4th missing texel
+    * should be an average of the other 3, hence for gather could return this.
+    * This is however NOT how the code here works, which just fixes up the
+    * weights used for filtering instead. And of course for gather there is
+    * no filter to tweak...
+    */
+   accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter &&
+                           !is_gather;
 
    lp_build_extract_image_sizes(bld,
                                 &bld->int_size_bld,
@@ -1160,10 +1171,11 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                              data_ptr, mipoffsets, neighbors[0][1]);
 
    if (dims == 1) {
+      assert(!is_gather);
       if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
          /* Interpolate two samples from 1D image to produce one color */
          for (chan = 0; chan < 4; chan++) {
-            colors_out[chan] = lp_build_lerp(&bld->texel_bld, s_fpart,
+            colors_out[chan] = lp_build_lerp(texel_bld, s_fpart,
                                              neighbors[0][0][chan],
                                              neighbors[0][1][chan],
                                              0);
@@ -1174,7 +1186,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
          cmpval0 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
          cmpval1 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
          /* simplified lerp, AND mask with weight and add */
-         colors_out[0] = lp_build_masklerp(&bld->texel_bld, s_fpart,
+         colors_out[0] = lp_build_masklerp(texel_bld, s_fpart,
                                            cmpval0, cmpval1);
          colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
       }
@@ -1301,15 +1313,38 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
       }
 
       if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
-         /* Bilinear interpolate the four samples from the 2D image / 3D slice */
-         for (chan = 0; chan < 4; chan++) {
-            colors0[chan] = lp_build_lerp_2d(&bld->texel_bld,
-                                             s_fpart, t_fpart,
-                                             neighbors[0][0][chan],
-                                             neighbors[0][1][chan],
-                                             neighbors[1][0][chan],
-                                             neighbors[1][1][chan],
-                                             0);
+         if (is_gather) {
+            /*
+             * Just assign the red channel (no component selection yet).
+             * This is a bit hackish, we usually do the swizzle at the
+             * end of sampling (much less values to swizzle), but this
+             * obviously cannot work when using gather.
+             */
+            unsigned chan_swiz = bld->static_texture_state->swizzle_r;
+            colors0[0] = lp_build_swizzle_soa_channel(texel_bld,
+                                                      neighbors[1][0],
+                                                      chan_swiz);
+            colors0[1] = lp_build_swizzle_soa_channel(texel_bld,
+                                                      neighbors[1][1],
+                                                      chan_swiz);
+            colors0[2] = lp_build_swizzle_soa_channel(texel_bld,
+                                                      neighbors[0][1],
+                                                      chan_swiz);
+            colors0[3] = lp_build_swizzle_soa_channel(texel_bld,
+                                                      neighbors[0][0],
+                                                      chan_swiz);
+         }
+         else {
+            /* Bilinear interpolate the four samples from the 2D image / 3D slice */
+            for (chan = 0; chan < 4; chan++) {
+               colors0[chan] = lp_build_lerp_2d(texel_bld,
+                                                s_fpart, t_fpart,
+                                                neighbors[0][0][chan],
+                                                neighbors[0][1][chan],
+                                                neighbors[1][0][chan],
+                                                neighbors[1][1][chan],
+                                                0);
+            }
          }
       }
       else {
@@ -1318,9 +1353,34 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
          cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
          cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
          cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
-         colors0[0] = lp_build_masklerp2d(&bld->texel_bld, s_fpart, t_fpart,
-                                          cmpval00, cmpval01, cmpval10, cmpval11);
-         colors0[1] = colors0[2] = colors0[3] = colors0[0];
+
+         if (is_gather) {
+            /* more hacks for swizzling, should be X, ONE or ZERO... */
+            unsigned chan_swiz = bld->static_texture_state->swizzle_r;
+            if (chan_swiz <= PIPE_SWIZZLE_ALPHA) {
+               colors0[0] = lp_build_select(texel_bld, cmpval10,
+                                            texel_bld->one, texel_bld->zero);
+               colors0[1] = lp_build_select(texel_bld, cmpval11,
+                                            texel_bld->one, texel_bld->zero);
+               colors0[2] = lp_build_select(texel_bld, cmpval01,
+                                            texel_bld->one, texel_bld->zero);
+               colors0[3] = lp_build_select(texel_bld, cmpval00,
+                                            texel_bld->one, texel_bld->zero);
+            }
+            else if (chan_swiz == PIPE_SWIZZLE_ZERO) {
+               colors0[0] = colors0[1] = colors0[2] = colors0[3] =
+                            texel_bld->zero;
+            }
+            else {
+               colors0[0] = colors0[1] = colors0[2] = colors0[3] =
+                            texel_bld->one;
+            }
+         }
+         else {
+            colors0[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
+                                             cmpval00, cmpval01, cmpval10, cmpval11);
+            colors0[1] = colors0[2] = colors0[3] = colors0[0];
+         }
       }
 
       if (accurate_cube_corners) {
@@ -1341,6 +1401,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
          LLVMValueRef neighbors1[2][2][4];
          LLVMValueRef colors1[4];
 
+         assert(!is_gather);
+
          /* get x0/x1/y0/y1 texels at z1 */
          lp_build_sample_texel_soa(bld,
                                    width_vec, height_vec, depth_vec,
@@ -1366,7 +1428,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
          if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
             /* Bilinear interpolate the four samples from the second Z slice */
             for (chan = 0; chan < 4; chan++) {
-               colors1[chan] = lp_build_lerp_2d(&bld->texel_bld,
+               colors1[chan] = lp_build_lerp_2d(texel_bld,
                                                 s_fpart, t_fpart,
                                                 neighbors1[0][0][chan],
                                                 neighbors1[0][1][chan],
@@ -1376,7 +1438,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
             }
             /* Linearly interpolate the two samples from the two 3D slices */
             for (chan = 0; chan < 4; chan++) {
-               colors_out[chan] = lp_build_lerp(&bld->texel_bld,
+               colors_out[chan] = lp_build_lerp(texel_bld,
                                                 r_fpart,
                                                 colors0[chan], colors1[chan],
                                                 0);
@@ -1388,13 +1450,13 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
             cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
             cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
             cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
-            colors1[0] = lp_build_masklerp2d(&bld->texel_bld, s_fpart, t_fpart,
+            colors1[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
                                              cmpval00, cmpval01, cmpval10, cmpval11);
             /* Linearly interpolate the two samples from the two 3D slices */
-            colors_out[0] = lp_build_lerp(&bld->texel_bld,
-                                             r_fpart,
-                                             colors0[0], colors1[0],
-                                             0);
+            colors_out[0] = lp_build_lerp(texel_bld,
+                                          r_fpart,
+                                          colors0[0], colors1[0],
+                                          0);
             colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
          }
       }
@@ -1418,6 +1480,7 @@ static void
 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                        unsigned img_filter,
                        unsigned mip_filter,
+                       boolean is_gather,
                        LLVMValueRef *coords,
                        const LLVMValueRef *offsets,
                        LLVMValueRef ilevel0,
@@ -1459,7 +1522,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    }
    else {
       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
-      lp_build_sample_image_linear(bld, size0, NULL,
+      lp_build_sample_image_linear(bld, is_gather, size0, NULL,
                                    row_stride0_vec, img_stride0_vec,
                                    data_ptr0, mipoff0, coords, offsets,
                                    colors0);
@@ -1520,7 +1583,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                                           colors1);
          }
          else {
-            lp_build_sample_image_linear(bld, size1, NULL,
+            lp_build_sample_image_linear(bld, FALSE, size1, NULL,
                                          row_stride1_vec, img_stride1_vec,
                                          data_ptr1, mipoff1, coords, offsets,
                                          colors1);
@@ -1594,7 +1657,7 @@ lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
       mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
    }
 
-   lp_build_sample_image_linear(bld, size0, linear_mask,
+   lp_build_sample_image_linear(bld, FALSE, size0, linear_mask,
                                 row_stride0_vec, img_stride0_vec,
                                 data_ptr0, mipoff0, coords, offsets,
                                 colors0);
@@ -1638,7 +1701,7 @@ lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
             mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
          }
 
-         lp_build_sample_image_linear(bld, size1, linear_mask,
+         lp_build_sample_image_linear(bld, FALSE, size1, linear_mask,
                                       row_stride1_vec, img_stride1_vec,
                                       data_ptr1, mipoff1, coords, offsets,
                                       colors1);
@@ -2061,6 +2124,7 @@ lp_build_clamp_border_color(struct lp_build_sample_context *bld,
 static void
 lp_build_sample_general(struct lp_build_sample_context *bld,
                         unsigned sampler_unit,
+                        boolean is_gather,
                         LLVMValueRef *coords,
                         const LLVMValueRef *offsets,
                         LLVMValueRef lod_positive,
@@ -2105,6 +2169,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
    if (min_filter == mag_filter) {
       /* no need to distinguish between minification and magnification */
       lp_build_sample_mipmap(bld, min_filter, mip_filter,
+                             is_gather,
                              coords, offsets,
                              ilevel0, ilevel1, lod_fpart,
                              texels);
@@ -2126,7 +2191,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
          lp_build_if(&if_ctx, bld->gallivm, lod_positive);
          {
             /* Use the minification filter */
-            lp_build_sample_mipmap(bld, min_filter, mip_filter,
+            lp_build_sample_mipmap(bld, min_filter, mip_filter, FALSE,
                                    coords, offsets,
                                    ilevel0, ilevel1, lod_fpart,
                                    texels);
@@ -2135,6 +2200,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
          {
             /* Use the magnification filter */
             lp_build_sample_mipmap(bld, mag_filter, PIPE_TEX_MIPFILTER_NONE,
+                                   FALSE,
                                    coords, offsets,
                                    ilevel0, NULL, NULL,
                                    texels);
@@ -2187,7 +2253,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
              * All pixels require just nearest filtering, which is way
              * cheaper than linear, hence do a separate path for that.
              */
-            lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
+            lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST, FALSE,
                                    mip_filter_for_nearest,
                                    coords, offsets,
                                    ilevel0, ilevel1, lod_fpart,
@@ -2488,6 +2554,16 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
    } else {
       derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
    }
+   if (op_type == LP_SAMPLER_OP_GATHER) {
+      /*
+       * gather4 is exactly like GL_LINEAR filtering but in the end skipping
+       * the actual filtering. Using mostly the same paths, so cube face
+       * selection, coord wrapping etc. all naturally uses the same code.
+       */
+      derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+      derived_sampler_state.min_img_filter = PIPE_TEX_FILTER_LINEAR;
+      derived_sampler_state.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
+   }
    mip_filter = derived_sampler_state.min_mip_filter;
 
    if (0) {
@@ -2673,6 +2749,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
       LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
       LLVMValueRef ilevel0 = NULL, ilevel1 = NULL;
       boolean use_aos = util_format_fits_8unorm(bld.format_desc) &&
+                        op_is_tex &&
                         /* not sure this is strictly needed or simply impossible */
                         derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
                         lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
@@ -2743,6 +2820,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
 
          else {
             lp_build_sample_general(&bld, sampler_index,
+                                    op_type == LP_SAMPLER_OP_GATHER,
                                     newcoords, offsets,
                                     lod_positive, lod_fpart,
                                     ilevel0, ilevel1,
@@ -2889,6 +2967,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
                newcoords4[4] = lp_build_extract_range(gallivm, newcoords[4], 4*i, 4);
 
                lp_build_sample_general(&bld4, sampler_index,
+                                       op_type == LP_SAMPLER_OP_GATHER,
                                        newcoords4, offsets4,
                                        lod_positive4, lod_fpart4,
                                        ilevel04, ilevel14,
@@ -2905,7 +2984,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
       }
    }
 
-   if (target != PIPE_BUFFER) {
+   if (target != PIPE_BUFFER && op_type != LP_SAMPLER_OP_GATHER) {
       apply_sampler_swizzle(&bld, texel_out);
    }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index ae527b2..17b68ff 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1961,7 +1961,8 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
           const struct tgsi_full_instruction *inst,
           enum lp_build_tex_modifier modifier,
           LLVMValueRef *texel,
-          unsigned sampler_reg)
+          unsigned sampler_reg,
+          enum lp_sampler_op_type sampler_op)
 {
    unsigned unit = inst->Src[sampler_reg].Register.Index;
    LLVMValueRef oow = NULL;
@@ -1974,7 +1975,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
    unsigned num_derivs, num_offsets, i;
    unsigned shadow_coord = 0;
    unsigned layer_coord = 0;
-   unsigned sample_key = LP_SAMPLER_OP_TEXTURE << LP_SAMPLER_OP_TYPE_SHIFT;
+   unsigned sample_key = sampler_op << LP_SAMPLER_OP_TYPE_SHIFT;
 
    memset(&params, 0, sizeof(params));
 
@@ -2137,7 +2138,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
    }
    sample_key |= lod_property << LP_SAMPLER_LOD_PROPERTY_SHIFT;
 
-   /* some advanced gather instructions (txgo) would require 4 offsets */
+   /* we don't handle the 4 offset version of tg4 */
    if (inst->Texture.NumOffsets == 1) {
       unsigned dim;
       sample_key |= LP_SAMPLER_OFFSETS;
@@ -2972,7 +2973,7 @@ tex_emit(
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
    emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
-            emit_data->output, 1);
+            emit_data->output, 1, LP_SAMPLER_OP_TEXTURE);
 }
 
 static void
@@ -2984,7 +2985,7 @@ tex2_emit(
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
    emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
-            emit_data->output, 2);
+            emit_data->output, 2, LP_SAMPLER_OP_TEXTURE);
 }
 
 static void
@@ -2996,7 +2997,7 @@ txb_emit(
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
    emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_LOD_BIAS,
-            emit_data->output, 1);
+            emit_data->output, 1, LP_SAMPLER_OP_TEXTURE);
 }
 
 static void
@@ -3008,7 +3009,7 @@ txb2_emit(
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
    emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_LOD_BIAS,
-            emit_data->output, 2);
+            emit_data->output, 2, LP_SAMPLER_OP_TEXTURE);
 }
 
 static void
@@ -3020,7 +3021,7 @@ txd_emit(
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
    emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV,
-            emit_data->output, 3);
+            emit_data->output, 3, LP_SAMPLER_OP_TEXTURE);
 }
 
 static void
@@ -3032,7 +3033,7 @@ txl_emit(
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
    emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD,
-            emit_data->output, 1);
+            emit_data->output, 1, LP_SAMPLER_OP_TEXTURE);
 }
 
 static void
@@ -3044,7 +3045,7 @@ txl2_emit(
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
    emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD,
-            emit_data->output, 2);
+            emit_data->output, 2, LP_SAMPLER_OP_TEXTURE);
 }
 
 static void
@@ -3056,7 +3057,19 @@ txp_emit(
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
 
    emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_PROJECTED,
-            emit_data->output, 1);
+            emit_data->output, 1, LP_SAMPLER_OP_TEXTURE);
+}
+
+static void
+tg4_emit(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
+
+   emit_tex(bld, emit_data->inst, LP_BLD_TEX_MODIFIER_NONE,
+            emit_data->output, 2, LP_SAMPLER_OP_GATHER);
 }
 
 static void
@@ -3775,6 +3788,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    bld.bld_base.op_actions[TGSI_OPCODE_TEX2].emit = tex2_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXB2].emit = txb2_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_TXL2].emit = txl2_emit;
+   bld.bld_base.op_actions[TGSI_OPCODE_TG4].emit = tg4_emit;
    /* DX10 sampling ops */
    bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE].emit = sample_emit;
    bld.bld_base.op_actions[TGSI_OPCODE_SAMPLE_B].emit = sample_b_emit;
-- 
1.9.1