Mesa (main): gallivm: add support for anisotropic sampling.

Wed Jul 21 00:40:08 UTC 2021

Module: Mesa
Branch: main
Commit: ce2b711c0a5d2c1d595f783e7a7bc92388cef4e7
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=ce2b711c0a5d2c1d595f783e7a7bc92388cef4e7

Author: Dave Airlie <airlied at redhat.com>
Date:   Mon Feb  8 12:32:04 2021 +1000

gallivm: add support for anisotropic sampling.

This is a port of the softpipe anisotropic filtering
to llvmpipe. It should produce pretty similar results.

This contains the proposed fix to the softpipe calculating
dq after scaling.

It also contains a number of other fixes around vector lengths
etc. caught during testing.

Reviewed-by: Roland Scheidegger <sroland at vmware.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8804>

---

 src/gallium/auxiliary/gallivm/lp_bld_sample.c     | 106 ++++-
 src/gallium/auxiliary/gallivm/lp_bld_sample.h     |   9 +
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 450 +++++++++++++++++++++-
 3 files changed, 559 insertions(+), 6 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 99ddd1cb32f..df8a1ba2192 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -194,6 +194,8 @@ lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
    state->min_mip_filter    = sampler->min_mip_filter;
    state->seamless_cube_map = sampler->seamless_cube_map;
    state->reduction_mode    = sampler->reduction_mode;
+   state->aniso = sampler->max_anisotropy > 1.0f;
+
    if (sampler->max_lod > 0.0f) {
       state->max_lod_pos = 1;
    }
@@ -233,6 +235,94 @@ lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
    state->normalized_coords = sampler->normalized_coords;
 }
 
+/* Build aniso pmin value: returns pmin^2, the smaller of the two squared size-scaled gradient lengths (px2, py2), raised to pmax^2 / max_aniso^2 where the pmax^2 / pmin^2 ratio exceeds max_aniso^2. s and t are the texture coordinates, max_aniso the scalar maximum anisotropy; the caller uses the result as a squared rho. */
+static LLVMValueRef
+lp_build_pmin(struct lp_build_sample_context *bld,
+              unsigned texture_unit,
+              LLVMValueRef s,
+              LLVMValueRef t,
+              LLVMValueRef max_aniso)
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = bld->gallivm->builder;
+   struct lp_build_context *coord_bld = &bld->coord_bld;
+   struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
+   struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
+   struct lp_build_context *pmin_bld = &bld->lodf_bld;
+   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
+   LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
+   LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
+   LLVMValueRef ddx_ddy = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
+   LLVMValueRef int_size, float_size;
+   LLVMValueRef first_level, first_level_vec;
+   unsigned length = coord_bld->type.length;
+   unsigned num_quads = length / 4;
+   boolean pmin_per_quad = pmin_bld->type.length != length; /* result type is narrower than the coord vector */
+   unsigned i;
+
+   first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
+                                                 bld->context_ptr, texture_unit, NULL);
+   first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
+   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec, TRUE); /* bld->int_size minified by first_level */
+   float_size = lp_build_int_to_float(float_size_bld, int_size);
+   max_aniso = lp_build_broadcast_scalar(coord_bld, max_aniso);
+   max_aniso = lp_build_mul(coord_bld, max_aniso, max_aniso); /* squared: it is only used against squared lengths below */
+
+   static const unsigned char swizzle01[] = { /* selects elements 0 and 1: used for ddx_ddys */
+      0, 1,
+      LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+   };
+   static const unsigned char swizzle23[] = { /* selects elements 2 and 3: used for ddx_ddyt */
+      2, 3,
+      LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+   };
+   LLVMValueRef ddx_ddys, ddx_ddyt, floatdim, shuffles[LP_MAX_VECTOR_LENGTH / 4];
+
+   for (i = 0; i < num_quads; i++) { /* per quad: float_size elements 0, 0, 1, 1 */
+      shuffles[i*4+0] = shuffles[i*4+1] = index0;
+      shuffles[i*4+2] = shuffles[i*4+3] = index1;
+   }
+   floatdim = LLVMBuildShuffleVector(builder, float_size, float_size,
+                                     LLVMConstVector(shuffles, length), "");
+   ddx_ddy = lp_build_mul(coord_bld, ddx_ddy, floatdim); /* scale the derivatives by the texture size */
+
+   ddx_ddy = lp_build_mul(coord_bld, ddx_ddy, ddx_ddy); /* square every derivative */
+
+   ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy, swizzle01);
+   ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy, swizzle23);
+
+   LLVMValueRef px2_py2 = lp_build_add(coord_bld, ddx_ddys, ddx_ddyt); /* squared s term + squared t term */
+
+   static const unsigned char swizzle0[] = { /* selects element 0: used for px2 */
+     0, LP_BLD_SWIZZLE_DONTCARE,
+     LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+   };
+   static const unsigned char swizzle1[] = { /* selects element 1: used for py2 */
+     1, LP_BLD_SWIZZLE_DONTCARE,
+     LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+   };
+   LLVMValueRef px2 = lp_build_swizzle_aos(coord_bld, px2_py2, swizzle0);
+   LLVMValueRef py2 = lp_build_swizzle_aos(coord_bld, px2_py2, swizzle1);
+
+   LLVMValueRef pmax2 = lp_build_max(coord_bld, px2, py2);
+   LLVMValueRef pmin2 = lp_build_min(coord_bld, px2, py2);
+
+   LLVMValueRef temp = lp_build_mul(coord_bld, pmin2, max_aniso); /* pmin^2 * max_aniso^2 */
+
+   LLVMValueRef comp = lp_build_compare(gallivm, coord_bld->type, PIPE_FUNC_GREATER,
+                                        pmax2, temp); /* pmax^2 > pmin^2 * max_aniso^2, i.e. the ratio exceeds the limit */
+
+   LLVMValueRef pmin2_alt = lp_build_div(coord_bld, pmax2, max_aniso); /* pmax^2 / max_aniso^2 */
+
+   pmin2 = lp_build_select(coord_bld, comp, pmin2_alt, pmin2); /* raise pmin^2 only where the limit was exceeded */
+
+   if (pmin_per_quad)
+      pmin2 = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+                                        pmin_bld->type, pmin2, 0);
+   else
+      pmin2 = lp_build_swizzle_scalar_aos(pmin_bld, pmin2, 0, 4);
+   return pmin2;
+}
 
 /**
  * Generate code to compute coordinate gradient (rho).
@@ -740,6 +830,7 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
                       LLVMValueRef lod_bias, /* optional */
                       LLVMValueRef explicit_lod, /* optional */
                       unsigned mip_filter,
+                      LLVMValueRef max_aniso,
                       LLVMValueRef *out_lod,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart,
@@ -796,13 +887,19 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
          boolean rho_squared = (bld->no_rho_approx &&
                                 (bld->dims > 1)) || cube_rho;
 
-         rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);
+         if (bld->static_sampler_state->aniso &&
+             !explicit_lod) {
+            rho = lp_build_pmin(bld, texture_unit, s, t, max_aniso);
+            rho_squared = true;
+         } else
+            rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);
 
          /*
           * Compute lod = log2(rho)
           */
 
          if (!lod_bias && !is_lodq &&
+             !bld->static_sampler_state->aniso &&
              !bld->static_sampler_state->lod_bias_non_zero &&
              !bld->static_sampler_state->apply_max_lod &&
              !bld->static_sampler_state->apply_min_lod) {
@@ -829,7 +926,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
                return;
             }
             if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
-                !bld->no_brilinear && !rho_squared) {
+                !bld->no_brilinear && !rho_squared &&
+                !bld->static_sampler_state->aniso) {
                /*
                 * This can't work if rho is squared. Not sure if it could be
                 * fixed while keeping it worthwile, could also do sqrt here
@@ -908,7 +1006,9 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
    *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                     lod, lodf_bld->zero);
 
-   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+   if (bld->static_sampler_state->aniso) {
+      *out_lod_ipart = lp_build_itrunc(lodf_bld, lod);
+   } else if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
       if (!bld->no_brilinear) {
          lp_build_brilinear_lod(lodf_bld, lod, BRILINEAR_FACTOR,
                                 out_lod_ipart, out_lod_fpart);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 374eee2c7bb..580daa89dc8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -202,6 +202,7 @@ struct lp_static_sampler_state
    unsigned apply_min_lod:1;  /**< min_lod > 0 ? */
    unsigned apply_max_lod:1;  /**< max_lod < last_level ? */
    unsigned seamless_cube_map:1;
+   unsigned aniso:1;
 
    /* Hacks */
    unsigned force_nearest_s:1;
@@ -331,6 +332,13 @@ struct lp_sampler_dynamic_state
                    LLVMValueRef context_ptr,
                    unsigned sampler_unit);
 
+   /** Obtain maximum anisotropy */
+   LLVMValueRef
+   (*max_aniso)(const struct lp_sampler_dynamic_state *state,
+                struct gallivm_state *gallivm,
+                LLVMValueRef context_ptr,
+                unsigned sampler_unit);
+
    /** 
     * Obtain texture cache (returns ptr to lp_build_format_cache).
     *
@@ -580,6 +588,7 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
                       LLVMValueRef lod_bias, /* optional */
                       LLVMValueRef explicit_lod, /* optional */
                       unsigned mip_filter,
+                      LLVMValueRef max_aniso,
                       LLVMValueRef *out_lod,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index c5dcab468d2..c5616be348a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -2067,6 +2067,429 @@ lp_build_layer_coord(struct lp_build_sample_context *bld,
    }
 }
 
+#define WEIGHT_LUT_SIZE 1024
+/* Emit code for anisotropic sampling: derive ellipse coefficients A, B, C, F from the coordinate derivatives, scan the ellipse bounding box in (u, v), accumulate nearest texel fetches from level ilevel0 weighted by entries of bld->aniso_filter_table, and store the weight-normalized colors (or a linear-filter fallback where the weight sum is zero) through colors_out. */
+static void
+lp_build_sample_aniso(struct lp_build_sample_context *bld,
+                      unsigned img_filter, /* not referenced in the body */
+                      unsigned mip_filter, /* not referenced in the body */
+                      boolean is_gather, /* not referenced in the body */
+                      const LLVMValueRef *coords,
+                      const LLVMValueRef *offsets,
+                      LLVMValueRef ilevel0,
+                      LLVMValueRef ilevel1, /* not referenced in the body */
+                      LLVMValueRef lod_fpart, /* not referenced in the body */
+                      LLVMValueRef *colors_out)
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *coord_bld = &bld->coord_bld;
+   struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
+   LLVMValueRef ddx_ddy = lp_build_packed_ddx_ddy_twocoord(&bld->coord_bld, coords[0], coords[1]);
+   LLVMValueRef float_size;
+   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+   LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
+   LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
+   unsigned length = bld->coord_bld.type.length;
+   unsigned num_quads = length / 4;
+   unsigned i;
+   LLVMValueRef filter_table = bld->aniso_filter_table;
+   LLVMValueRef size0, row_stride0_vec, img_stride0_vec;
+   LLVMValueRef data_ptr0, mipoff0 = NULL;
+
+   lp_build_mipmap_level_sizes(bld, ilevel0,
+                               &size0,
+                               &row_stride0_vec, &img_stride0_vec);
+   if (bld->num_mips == 1) {
+      data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
+   }
+   else {
+      /* This path should work for num_lods 1 too but slightly less efficient */
+      data_ptr0 = bld->base_ptr;
+      mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
+   }
+
+   float_size = lp_build_int_to_float(&bld->float_size_in_bld, bld->int_size);
+
+   LLVMValueRef float_size_lvl = lp_build_int_to_float(&bld->float_size_bld, size0);
+   /* extract width and height into vectors for use later */
+   static const unsigned char swizzle15[] = { /* indices 1 and 5: used for height_dim and for Ay/By/Cy */
+      1, 1, 1, 1, 5, 5, 5, 5
+   };
+   static const unsigned char swizzle04[] = { /* indices 0 and 4: used for width_dim and the final swizzle of A, B, C */
+      0, 0, 0, 0, 4, 4, 4, 4
+   };
+   LLVMValueRef width_dim, height_dim;
+
+   width_dim = lp_build_swizzle_aos_n(gallivm, float_size_lvl, swizzle04, bld->float_size_bld.type.length, bld->coord_bld.type.length);
+   height_dim = lp_build_swizzle_aos_n(gallivm, float_size_lvl, swizzle15, bld->float_size_bld.type.length, bld->coord_bld.type.length);
+
+
+   /* shuffle width/height for ddx/ddy calculations. */
+   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
+
+   for (i = 0; i < num_quads; i++) { /* per quad: float_size elements 0, 0, 1, 1 */
+      shuffles[i*4+0] = shuffles[i*4+1] = index0;
+      shuffles[i*4+2] = shuffles[i*4+3] = index1;
+   }
+
+   LLVMValueRef floatdim = LLVMBuildShuffleVector(builder, float_size, float_size,
+                                                  LLVMConstVector(shuffles, length), "");
+
+   ddx_ddy = lp_build_mul(coord_bld, ddx_ddy, floatdim);
+
+   LLVMValueRef scaling = lp_build_shl(&bld->leveli_bld, bld->leveli_bld.one, ilevel0); /* 1 << ilevel0 */
+   scaling = lp_build_int_to_float(&bld->levelf_bld, scaling);
+   scaling = lp_build_rcp(&bld->levelf_bld, scaling); /* 1 / (1 << ilevel0) */
+
+   if (bld->num_lods != length) {
+      if (bld->levelf_bld.type.length == 1)
+         scaling = lp_build_broadcast_scalar(coord_bld,
+                                             scaling);
+      else
+         scaling = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
+                                                         bld->levelf_bld.type,
+                                                         coord_bld->type,
+                                                         scaling);
+   }
+
+   ddx_ddy = lp_build_mul(coord_bld, ddx_ddy, scaling);
+
+   static const unsigned char swizzle01[] = { /* indices 0, 1: used for ddx_ddys */
+      0, 1, 0, 1,
+   };
+   static const unsigned char swizzle23[] = { /* indices 2, 3: used for ddx_ddyt */
+      2, 3, 2, 3,
+   };
+
+   LLVMValueRef ddx_ddys, ddx_ddyt;
+   ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy, swizzle01);
+   ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy, swizzle23);
+
+   /* compute ellipse coefficients */
+   /* A*x*x + B*x*y + C*y*y = F. */
+   /* float A = vx*vx+vy*vy+1; */
+   LLVMValueRef A = lp_build_mul(coord_bld, ddx_ddyt, ddx_ddyt);
+
+   LLVMValueRef Ay = lp_build_swizzle_aos(coord_bld, A, swizzle15);
+   A = lp_build_add(coord_bld, A, Ay);
+   A = lp_build_add(coord_bld, A, coord_bld->one);
+   A = lp_build_swizzle_aos(coord_bld, A, swizzle04);
+
+   /* float B = -2*(ux*vx+uy*vy); */
+   LLVMValueRef B = lp_build_mul(coord_bld, ddx_ddys, ddx_ddyt);
+   LLVMValueRef By = lp_build_swizzle_aos(coord_bld, B, swizzle15);
+   B = lp_build_add(coord_bld, B, By);
+   B = lp_build_mul_imm(coord_bld, B, -2);
+   B = lp_build_swizzle_aos(coord_bld, B, swizzle04);
+
+   /* float C = ux*ux+uy*uy+1; */
+   LLVMValueRef C = lp_build_mul(coord_bld, ddx_ddys, ddx_ddys);
+   LLVMValueRef Cy = lp_build_swizzle_aos(coord_bld, C, swizzle15);
+   C = lp_build_add(coord_bld, C, Cy);
+   C = lp_build_add(coord_bld, C, coord_bld->one);
+   C = lp_build_swizzle_aos(coord_bld, C, swizzle04);
+
+   /* float F = A*C-B*B/4.0f; */
+   LLVMValueRef F = lp_build_mul(coord_bld, B, B);
+   F = lp_build_div(coord_bld, F, lp_build_const_vec(gallivm, coord_bld->type, 4.0));
+   LLVMValueRef F_p2 = lp_build_mul(coord_bld, A, C);
+   F = lp_build_sub(coord_bld, F_p2, F);
+
+   /* compute ellipse bounding box in texture space */
+   /* const float d = -B*B+4.0f*C*A; */
+   LLVMValueRef d = lp_build_sub(coord_bld, coord_bld->zero, lp_build_mul(coord_bld, B, B));
+   LLVMValueRef d_p2 = lp_build_mul(coord_bld, A, C);
+   d_p2 = lp_build_mul_imm(coord_bld, d_p2, 4);
+   d = lp_build_add(coord_bld, d, d_p2);
+
+   /* const float box_u = 2.0f / d * sqrtf(d*C*F); */
+   /* box_u -> half of bbox width */
+   LLVMValueRef temp;
+   temp = lp_build_mul(coord_bld, d, C);
+   temp = lp_build_mul(coord_bld, temp, F);
+   temp = lp_build_sqrt(coord_bld, temp);
+
+   LLVMValueRef box_u = lp_build_div(coord_bld, lp_build_const_vec(gallivm, coord_bld->type, 2.0), d);
+   box_u = lp_build_mul(coord_bld, box_u, temp);
+
+   /* const float box_v = 2.0f / d * sqrtf(A*d*F); */
+   /* box_v -> half of bbox height */
+   temp = lp_build_mul(coord_bld, A, d);
+   temp = lp_build_mul(coord_bld, temp, F);
+   temp = lp_build_sqrt(coord_bld, temp);
+
+   LLVMValueRef box_v = lp_build_div(coord_bld, lp_build_const_vec(gallivm, coord_bld->type, 2.0), d);
+   box_v = lp_build_mul(coord_bld, box_v, temp);
+
+   /* Scale ellipse formula to directly index the Filter Lookup Table.
+    * i.e. scale so that F = WEIGHT_LUT_SIZE-1
+    */
+   LLVMValueRef formScale = lp_build_div(coord_bld, lp_build_const_vec(gallivm, coord_bld->type, WEIGHT_LUT_SIZE - 1), F);
+
+   A = lp_build_mul(coord_bld, A, formScale);
+   B = lp_build_mul(coord_bld, B, formScale);
+   C = lp_build_mul(coord_bld, C, formScale);
+   /* F *= formScale; */ /* no need to scale F as we don't use it below here */
+
+   LLVMValueRef ddq = lp_build_mul_imm(coord_bld, A, 2); /* ddq = 2 * A, computed from the scaled A */
+
+   /* Heckbert MS thesis, p. 59; scan over the bounding box of the ellipse
+    * and incrementally update the value of Ax^2+Bxy+Cy^2; when this
+    * value, q, is less than F, we're inside the ellipse
+    */
+
+   LLVMValueRef float_size0 = lp_build_int_to_float(float_size_bld, bld->int_size);
+   LLVMValueRef width0 = lp_build_extract_broadcast(gallivm,
+                                                    float_size_bld->type,
+                                                    coord_bld->type,
+                                                    float_size0, index0);
+   LLVMValueRef height0 = lp_build_extract_broadcast(gallivm,
+                                                     float_size_bld->type,
+                                                     coord_bld->type,
+                                                     float_size0, index1);
+
+   /* texture->width0 * scaling */
+   width0 = lp_build_mul(coord_bld, width0, scaling);
+   /* texture->height0 * scaling */
+   height0 = lp_build_mul(coord_bld, height0, scaling);
+
+   /* tex_u = -0.5f + s[j] * texture->width0 * scaling */
+   LLVMValueRef tex_u = lp_build_mul(coord_bld, coords[0], width0);
+   tex_u = lp_build_add(coord_bld, tex_u, lp_build_const_vec(gallivm, coord_bld->type, -0.5f));
+
+   /* tex_v = -0.5f + t[j] * texture->height0 * scaling */
+   LLVMValueRef tex_v = lp_build_mul(coord_bld, coords[1], height0);
+   tex_v = lp_build_add(coord_bld, tex_v, lp_build_const_vec(gallivm, coord_bld->type, -0.5f));
+
+   /* const int u0 = (int) floorf(tex_u - box_u); */
+   LLVMValueRef u0 = lp_build_itrunc(coord_bld, lp_build_floor(coord_bld, lp_build_sub(coord_bld, tex_u, box_u)));
+   /* const int u1 = (int) ceilf(tex_u + box_u); */
+   LLVMValueRef u1 = lp_build_itrunc(coord_bld, lp_build_ceil(coord_bld, lp_build_add(coord_bld, tex_u, box_u)));
+
+   /* const int v0 = (int) floorf(tex_v - box_v); */
+   LLVMValueRef v0 = lp_build_itrunc(coord_bld, lp_build_floor(coord_bld, lp_build_sub(coord_bld, tex_v, box_v)));
+   /* const int v1 = (int) ceilf(tex_v + box_v); */
+   LLVMValueRef v1 = lp_build_itrunc(coord_bld, lp_build_ceil(coord_bld, lp_build_add(coord_bld, tex_v, box_v)));
+
+   /* const float U = u0 - tex_u; */
+   LLVMValueRef U = lp_build_sub(coord_bld, lp_build_int_to_float(coord_bld, u0), tex_u);
+
+   /* A * (2 * U + 1) */
+   LLVMValueRef dq_base = lp_build_mul_imm(coord_bld, U, 2);
+   dq_base = lp_build_add(coord_bld, dq_base, coord_bld->one);
+   dq_base = lp_build_mul(coord_bld, dq_base, A);
+
+   /* A * U * U */
+   LLVMValueRef q_base = lp_build_mul(coord_bld, U, U);
+   q_base = lp_build_mul(coord_bld, q_base, A);
+
+   LLVMValueRef colors0[4];
+   LLVMValueRef den_store = lp_build_alloca(gallivm, bld->texel_bld.vec_type, "den"); /* accumulated weight sum */
+
+   unsigned chan;
+   for (chan = 0; chan < 4; chan++)
+      colors0[chan] = lp_build_alloca(gallivm, bld->texel_bld.vec_type, "colors"); /* per-channel weighted color sum */
+
+   LLVMValueRef q_store, dq_store;
+   q_store = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "q");
+   dq_store = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "dq");
+
+   LLVMValueRef v_limiter = lp_build_alloca(gallivm, bld->int_coord_bld.vec_type, "v_limiter"); /* current v of the outer loop */
+   LLVMValueRef u_limiter = lp_build_alloca(gallivm, bld->int_coord_bld.vec_type, "u_limiter"); /* current u of the inner loop */
+
+   LLVMBuildStore(builder, v0, v_limiter);
+
+   /* create an LLVM loop block for the V iterator */
+   LLVMBasicBlockRef v_loop_block = lp_build_insert_new_block(gallivm, "vloop");
+
+   LLVMBuildBr(builder, v_loop_block);
+   LLVMPositionBuilderAtEnd(builder, v_loop_block);
+
+   LLVMValueRef v_val = LLVMBuildLoad(builder, v_limiter, "");
+   LLVMValueRef v_mask = LLVMBuildICmp(builder,
+                                       LLVMIntSLE,
+                                       v_val,
+                                       v1, ""); /* lanes with v <= v1 */
+
+   /* loop over V values. */
+   {
+      /*  const float V = v - tex_v; */
+      LLVMValueRef V = lp_build_sub(coord_bld, lp_build_int_to_float(coord_bld, v_val), tex_v);
+
+      /* float dq = dq_base + B * V; */
+      LLVMValueRef dq = lp_build_mul(coord_bld, V, B);
+      dq = lp_build_add(coord_bld, dq, dq_base);
+
+      /* float q = (C * V + B * U) * V + q_base */
+      LLVMValueRef q = lp_build_mul(coord_bld, C, V);
+      q = lp_build_add(coord_bld, q, lp_build_mul(coord_bld, B, U));
+      q = lp_build_mul(coord_bld, q, V);
+      q = lp_build_add(coord_bld, q, q_base);
+
+      LLVMBuildStore(builder, q, q_store);
+      LLVMBuildStore(builder, dq, dq_store);
+
+      LLVMBuildStore(builder, u0, u_limiter); /* restart u at u0 for every v */
+
+      /* create an LLVM loop block for the U iterator */
+      LLVMBasicBlockRef u_loop_block = lp_build_insert_new_block(gallivm, "uloop");
+
+      LLVMBuildBr(builder, u_loop_block);
+      LLVMPositionBuilderAtEnd(builder, u_loop_block);
+
+      LLVMValueRef u_val = LLVMBuildLoad(builder, u_limiter, "");
+      LLVMValueRef u_mask = LLVMBuildICmp(builder,
+                                          LLVMIntSLE,
+                                          u_val,
+                                          u1, ""); /* lanes with u <= u1 */
+
+      /* loop over U values */
+      {
+         /* q = (int)q */
+         q = lp_build_itrunc(coord_bld, LLVMBuildLoad(builder, q_store, ""));
+
+         /*
+          * avoid OOB access to filter table: build a mask of lanes with
+          * q <= 0x3ff, then clamp q to >= 0 and mask it to the low 10 bits.
+          */
+         LLVMValueRef q_mask = LLVMBuildICmp(builder,
+                                             LLVMIntSLE,
+                                             q,
+                                             lp_build_const_int_vec(gallivm, bld->int_coord_bld.type, 0x3ff), "");
+         q_mask = LLVMBuildSExt(builder, q_mask, bld->int_coord_bld.vec_type, "");
+
+         q = lp_build_max(&bld->int_coord_bld, q, bld->int_coord_bld.zero);
+         q = lp_build_and(&bld->int_coord_bld, q, lp_build_const_int_vec(gallivm, bld->int_coord_bld.type, 0x3ff));
+
+         /* scale the index by 4 (float size); the table pointer is cast to an i8 pointer. */
+         q = lp_build_mul_imm(&bld->int_coord_bld, q, 4);
+         filter_table = LLVMBuildBitCast(gallivm->builder, filter_table, LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
+
+         /* Lookup weights in filter table */
+         LLVMValueRef weights = lp_build_gather(gallivm, coord_bld->type.length,
+                                                coord_bld->type.width,
+                                                lp_elem_type(coord_bld->type),
+                                                TRUE, filter_table, q, TRUE);
+
+         /*
+          * Mask off the weights here which should ensure no-op for loops
+          * where some of the u/v values are not being calculated.
+          */
+         weights = LLVMBuildBitCast(builder, weights, bld->int_coord_bld.vec_type, "");
+         weights = lp_build_and(&bld->int_coord_bld, weights, LLVMBuildSExt(builder, u_mask, bld->int_coord_bld.vec_type, ""));
+         weights = lp_build_and(&bld->int_coord_bld, weights, LLVMBuildSExt(builder, v_mask, bld->int_coord_bld.vec_type, ""));
+         weights = lp_build_and(&bld->int_coord_bld, weights, q_mask);
+         weights = LLVMBuildBitCast(builder, weights, bld->coord_bld.vec_type, "");
+
+         /* if the weights are all 0 avoid doing the sampling at all. */
+         struct lp_build_if_state noloadw0;
+
+         LLVMValueRef wnz = LLVMBuildFCmp(gallivm->builder, LLVMRealUNE,
+                                          weights, bld->coord_bld.zero, "");
+         wnz = LLVMBuildSExt(builder, wnz, bld->int_coord_bld.vec_type, "");
+         wnz = lp_build_any_true_range(&bld->coord_bld, bld->coord_bld.type.length, wnz);
+         lp_build_if(&noloadw0, gallivm, wnz);
+         LLVMValueRef new_coords[3];
+         new_coords[0] = lp_build_div(coord_bld, lp_build_int_to_float(coord_bld, u_val), width_dim); /* u / width_dim */
+         new_coords[1] = lp_build_div(coord_bld, lp_build_int_to_float(coord_bld, v_val), height_dim); /* v / height_dim */
+         new_coords[2] = coords[2];
+
+         /* fetch the texel at (u, v) with nearest filtering, using size0 and the level-0 strides/pointers */
+         LLVMValueRef temp_colors[4];
+         lp_build_sample_image_nearest(bld, size0,
+                                       row_stride0_vec, img_stride0_vec,
+                                       data_ptr0, mipoff0, new_coords, offsets,
+                                       temp_colors);
+
+         for (chan = 0; chan < 4; chan++) {
+            LLVMValueRef tcolor = LLVMBuildLoad(builder, colors0[chan], "");
+
+            tcolor = lp_build_add(&bld->texel_bld, tcolor, lp_build_mul(&bld->texel_bld, temp_colors[chan], weights));
+            LLVMBuildStore(builder, tcolor, colors0[chan]);
+         }
+
+         /* the loop above multiplies colors by weight and adds them in. */
+         /* den += weight; */
+         LLVMValueRef den = LLVMBuildLoad(builder, den_store, "");
+         den = lp_build_add(&bld->texel_bld, den, weights);
+         LLVMBuildStore(builder, den, den_store);
+
+         lp_build_endif(&noloadw0);
+         /* q += dq; */
+         /* dq += ddq; */
+         q = LLVMBuildLoad(builder, q_store, "");
+         dq = LLVMBuildLoad(builder, dq_store, "");
+         q = lp_build_add(coord_bld, q, dq);
+         dq = lp_build_add(coord_bld, dq, ddq);
+         LLVMBuildStore(builder, q, q_store);
+         LLVMBuildStore(builder, dq, dq_store);
+      }
+      /* u += 1 */
+      u_val = LLVMBuildLoad(builder, u_limiter, "");
+      u_val = lp_build_add(&bld->int_coord_bld, u_val, bld->int_coord_bld.one);
+      LLVMBuildStore(builder, u_val, u_limiter);
+
+      u_mask = LLVMBuildICmp(builder,
+                             LLVMIntSLE,
+                             u_val,
+                             u1, "");
+      LLVMValueRef u_end_cond = LLVMBuildSExt(builder, u_mask, bld->int_coord_bld.vec_type, "");
+      u_end_cond = lp_build_any_true_range(&bld->coord_bld, bld->coord_bld.type.length, u_end_cond);
+
+      LLVMBasicBlockRef u_end_loop = lp_build_insert_new_block(gallivm, "u_end_loop");
+
+      LLVMBuildCondBr(builder, u_end_cond,
+                      u_loop_block, u_end_loop); /* branch back to uloop while u_end_cond is set */
+
+      LLVMPositionBuilderAtEnd(builder, u_end_loop);
+
+   }
+
+   /* v += 1 */
+   v_val = LLVMBuildLoad(builder, v_limiter, "");
+   v_val = lp_build_add(&bld->int_coord_bld, v_val, bld->int_coord_bld.one);
+   LLVMBuildStore(builder, v_val, v_limiter);
+
+   v_mask = LLVMBuildICmp(builder,
+                          LLVMIntSLE,
+                          v_val,
+                          v1, "");
+   LLVMValueRef v_end_cond = LLVMBuildSExt(builder, v_mask, bld->int_coord_bld.vec_type, "");
+   v_end_cond = lp_build_any_true_range(&bld->coord_bld, bld->coord_bld.type.length, v_end_cond);
+
+   LLVMBasicBlockRef v_end_loop = lp_build_insert_new_block(gallivm, "v_end_loop");
+
+   LLVMBuildCondBr(builder, v_end_cond,
+                   v_loop_block, v_end_loop); /* branch back to vloop while v_end_cond is set */
+
+   LLVMPositionBuilderAtEnd(builder, v_end_loop);
+
+   LLVMValueRef den = LLVMBuildLoad(builder, den_store, "");
+
+   for (chan = 0; chan < 4; chan++)
+      colors0[chan] = lp_build_div(&bld->texel_bld, LLVMBuildLoad(builder, colors0[chan], ""), den); /* normalize by the weight sum */
+   LLVMValueRef den0 = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_EQUAL, den, bld->coord_bld.zero); /* lanes whose weight sum is zero */
+
+   LLVMValueRef den0_any = lp_build_any_true_range(&bld->coord_bld, bld->coord_bld.type.length, den0);
+
+   struct lp_build_if_state den0_fallback;
+   lp_build_if(&den0_fallback, gallivm, den0_any); /* only emit the linear fallback path when den0_any is set */
+
+   LLVMValueRef colors_den0[4];
+   lp_build_sample_image_linear(bld, false, size0, NULL,
+                                 row_stride0_vec, img_stride0_vec,
+                                 data_ptr0, mipoff0, coords, offsets,
+                                 colors_den0);
+   for (chan = 0; chan < 4; chan++) {
+      LLVMValueRef chan_val = lp_build_select(&bld->texel_bld, den0, colors_den0[chan], colors0[chan]); /* linear result where den == 0 */
+      LLVMBuildStore(builder, chan_val, colors_out[chan]);
+   }
+   lp_build_else(&den0_fallback);
+   for (chan = 0; chan < 4; chan++)
+      LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
+   lp_build_endif(&den0_fallback);
+}
 
 /**
  * Calculate cube face, lod, mip levels.
@@ -2090,6 +2513,7 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
    const unsigned min_filter = bld->static_sampler_state->min_img_filter;
    const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
    const unsigned target = bld->static_texture_state->target;
+   const bool aniso = bld->static_sampler_state->aniso;
    LLVMValueRef first_level, cube_rho = NULL;
    LLVMValueRef lod_ipart = NULL;
    struct lp_derivatives cube_derivs;
@@ -2155,13 +2579,21 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
     */
    if (min_filter != mag_filter ||
        mip_filter != PIPE_TEX_MIPFILTER_NONE || is_lodq) {
+      LLVMValueRef max_aniso = NULL;
+
+      if (aniso)
+         max_aniso = bld->dynamic_state->max_aniso(bld->dynamic_state,
+                                                   bld->gallivm,
+                                                   bld->context_ptr,
+                                                   sampler_index);
+
       /* Need to compute lod either to choose mipmap levels or to
        * distinguish between minification/magnification with one mipmap level.
        */
       lp_build_lod_selector(bld, is_lodq, texture_index, sampler_index,
                             coords[0], coords[1], coords[2], cube_rho,
                             derivs, lod_bias, explicit_lod,
-                            mip_filter, lod,
+                            mip_filter, max_aniso, lod,
                             &lod_ipart, lod_fpart, lod_pos_or_zero);
       if (is_lodq) {
          LLVMValueRef last_level;
@@ -2197,7 +2629,8 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
       *lod_pos_or_zero = bld->lodi_bld.zero;
    }
 
-   if (bld->num_lods != bld->num_mips) {
+   if ((bld->num_lods != bld->num_mips || bld->num_lods == 1) &&
+       bld->lodi_bld.type.length != 1) {
       /* only makes sense if there's just a single mip level */
       assert(bld->num_mips == 1);
       lod_ipart = lp_build_extract_range(bld->gallivm, lod_ipart, 0, 1);
@@ -2206,6 +2639,12 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
    /*
     * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
     */
+
+   if (aniso) {
+      lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
+      return;
+   }
+
    switch (mip_filter) {
    default:
       debug_assert(0 && "bad mip_filter value in lp_build_sample_soa()");
@@ -2512,7 +2951,11 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
      lp_build_name(texels[chan], "sampler%u_texel_%c_var", sampler_unit, "xyzw"[chan]);
    }
 
-   if (min_filter == mag_filter) {
+   if (sampler_state->aniso) {
+      lp_build_sample_aniso(bld, PIPE_TEX_FILTER_NEAREST, mip_filter,
+                            false, coords, offsets, ilevel0,
+                            ilevel1, lod_fpart, texels);
+   } else if (min_filter == mag_filter) {
       /* no need to distinguish between minification and magnification */
       lp_build_sample_mipmap(bld, min_filter, mip_filter,
                              is_gather,
@@ -3183,6 +3626,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
                 op_is_tex &&
                 /* not sure this is strictly needed or simply impossible */
                 derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
+                derived_sampler_state.aniso == 0 &&
                 lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
 
       use_aos &= bld.num_lods <= num_quads ||