[Mesa-dev] [PATCH 1/2] gallivm: implement seamless cube filtering

Mon Oct 21 12:21:49 CEST 2013

Looks great AFAICT.

Jose

----- Original Message -----
> From: Roland Scheidegger <sroland at vmware.com>
> 
> For seamless cube filtering it is necessary to determine new faces and new
> coords per sample. The logic for this is _seriously_ complex (what needs
> to happen is very "asymmetric" wrt face, x/y under/overflow), further
> complicated by the fact that if the 4 samples are in a corner (meaning we
> only have actually 3 samples, and all 3 are on different faces) then
> falling off the edge is happening _both_ on x and y axis simultaneously.
> There was a noticeable performance hit in mesa's cubemap demo when seamless
> filtering was forced on (just below 10 percent or so in a debug build, when
> disabling all filtering hacks, otherwise it would probably be a bit more) and
> when always doing the logic, hence use a branch which only does it if any
> of the pixels in a quad (or in two quads) actually hit this. With that there
> was no measurable performance hit in the cubemap demo (neither in a debug nor
> release build), but this will vary (cubemap demo very rarely hits edges).
> Might also be different on other cpus, as this forces SoA sampling path which
> potentially can be quite a bit slower.
> Note that as for corners, this code gets all the 3 samples which actually
> exist right, and the 4th texel will simply be the same as one of the others,
> meaning that filter weights will be a bit wrong. This however should be
> enough for full OpenGL (but not d3d10) compliance.
> ---
>  src/gallium/auxiliary/gallivm/lp_bld_sample.c     |  138 +++++++++++
>  src/gallium/auxiliary/gallivm/lp_bld_sample.h     |   13 ++
>  src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |  257
>  +++++++++++++++++----
>  3 files changed, 368 insertions(+), 40 deletions(-)
> 
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
> b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
> index 1c35200..a032d9d 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
> @@ -1402,6 +1402,144 @@ lp_build_unnormalized_coords(struct
> lp_build_sample_context *bld,
>     }
>  }
>  
> +/**
> + * Generate new coords and faces for cubemap texels falling off the face.
> + *
> + * @param face   face (center) of the pixel
> + * @param x0     lower x coord
> + * @param x1     higher x coord (must be x0 + 1)
> + * @param y0     lower y coord
> + * @param y1     higher y coord (y0 + 1, possibly clamped by the caller)
> + * @param max_coord     texture cube (level) size - 1
> + * @param next_faces    new face values when falling off
> + * @param next_xcoords  new x coord values when falling off
> + * @param next_ycoords  new y coord values when falling off
> + *
> + * The arrays hold the new values when under/overflow of
> + * lower x, higher x, lower y, higher y coord would occur (in this order).
> + * next_xcoords/next_ycoords have two entries each (for both new lower and
> + * higher coord).
> + */
> +void
> +lp_build_cube_new_coords(struct lp_build_context *ivec_bld,
> +                        LLVMValueRef face,
> +                        LLVMValueRef x0,
> +                        LLVMValueRef x1,
> +                        LLVMValueRef y0,
> +                        LLVMValueRef y1,
> +                        LLVMValueRef max_coord,
> +                        LLVMValueRef next_faces[4],
> +                        LLVMValueRef next_xcoords[4][2],
> +                        LLVMValueRef next_ycoords[4][2])
> +{
> +   /*
> +    * Lookup tables aren't nice for simd code hence try some logic here.
> +    * (Note that while it would not be necessary to do per-sample (4)
> lookups
> +    * when using a LUT as it's impossible that texels fall off of positive
> +    * and negative edges simultaneously, it would however be necessary to
> +    * do 2 lookups for corner handling as in this case texels both fall off
> +    * of x and y axes.)
> +    */
> +   /*
> +    * Next faces (for face 012345):
> +    * x < 0.0  : 451110
> +    * x >= 1.0 : 540001
> +    * y < 0.0  : 225422
> +    * y >= 1.0 : 334533
> +    * Hence nfx+ (and nfy+) == nfx- (nfy-) xor 1
> +    * nfx-: face > 1 ? (face == 5 ? 0 : 1) : (4 + (face & 1))
> +    * nfy+: (face & ~4) > 1 ? face + 2 : 3;
> +    * This could also use pshufb instead, but would need (manually coded)
> +    * ssse3 intrinsic (llvm won't do non-constant shuffles).
> +    */
> +   struct gallivm_state *gallivm = ivec_bld->gallivm;
> +   LLVMValueRef sel, sel_f2345, sel_f23, sel_f2, tmpsel, tmp;
> +   LLVMValueRef faceand1, sel_fand1, maxmx0, maxmx1, maxmy0, maxmy1;
> +   LLVMValueRef c2 = lp_build_const_int_vec(gallivm, ivec_bld->type, 2);
> +   LLVMValueRef c3 = lp_build_const_int_vec(gallivm, ivec_bld->type, 3);
> +   LLVMValueRef c4 = lp_build_const_int_vec(gallivm, ivec_bld->type, 4);
> +   LLVMValueRef c5 = lp_build_const_int_vec(gallivm, ivec_bld->type, 5);
> +
> +   sel = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c5);
> +   tmpsel = lp_build_select(ivec_bld, sel, ivec_bld->zero, ivec_bld->one);
> +   sel_f2345 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, face,
> ivec_bld->one);
> +   faceand1 = lp_build_and(ivec_bld, face, ivec_bld->one);
> +   tmp = lp_build_add(ivec_bld, faceand1, c4);
> +   next_faces[0] = lp_build_select(ivec_bld, sel_f2345, tmpsel, tmp);
> +   next_faces[1] = lp_build_xor(ivec_bld, next_faces[0], ivec_bld->one);
> +
> +   tmp = lp_build_andnot(ivec_bld, face, c4);
> +   sel_f23 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, tmp, ivec_bld->one);
> +   tmp = lp_build_add(ivec_bld, face, c2);
> +   next_faces[3] = lp_build_select(ivec_bld, sel_f23, tmp, c3);
> +   next_faces[2] = lp_build_xor(ivec_bld, next_faces[3], ivec_bld->one);
> +
> +   /*
> +    * new xcoords (for face 012345):
> +    * x < 0.0  : max   max   t     max-t max  max
> +    * x >= 1.0 : 0     0     max-t t     0    0
> +    * y < 0.0  : max   0     max-s s     s    max-s
> +    * y >= 1.0 : max   0     s     max-s s    max-s
> +    *
> +    * ncx[1] = (face & ~4) > 1 ? (face == 2 ? max-t : t) : 0
> +    * ncx[0] = max - ncx[1]
> +    * ncx[3] = face > 1 ? (face & 1 ? max-s : s) : (face & 1) ? 0 : max
> +    * ncx[2] = (face & ~4) > 1 ? max - ncx[3] : ncx[3]
> +    */
> +   sel_f2 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c2);
> +   maxmy0 = lp_build_sub(ivec_bld, max_coord, y0);
> +   tmp = lp_build_select(ivec_bld, sel_f2, maxmy0, y0);
> +   next_xcoords[1][0] = lp_build_select(ivec_bld, sel_f23, tmp,
> ivec_bld->zero);
> +   next_xcoords[0][0] = lp_build_sub(ivec_bld, max_coord,
> next_xcoords[1][0]);
> +   maxmy1 = lp_build_sub(ivec_bld, max_coord, y1);
> +   tmp = lp_build_select(ivec_bld, sel_f2, maxmy1, y1);
> +   next_xcoords[1][1] = lp_build_select(ivec_bld, sel_f23, tmp,
> ivec_bld->zero);
> +   next_xcoords[0][1] = lp_build_sub(ivec_bld, max_coord,
> next_xcoords[1][1]);
> +
> +   sel_fand1 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, faceand1,
> ivec_bld->one);
> +
> +   tmpsel = lp_build_select(ivec_bld, sel_fand1, ivec_bld->zero, max_coord);
> +   maxmx0 = lp_build_sub(ivec_bld, max_coord, x0);
> +   tmp = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0);
> +   next_xcoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
> +   tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][0]);
> +   next_xcoords[2][0] = lp_build_select(ivec_bld, sel_f23, tmp,
> next_xcoords[3][0]);
> +   maxmx1 = lp_build_sub(ivec_bld, max_coord, x1);
> +   tmp = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1);
> +   next_xcoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
> +   tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][1]);
> +   next_xcoords[2][1] = lp_build_select(ivec_bld, sel_f23, tmp,
> next_xcoords[3][1]);
> +
> +   /*
> +    * new ycoords (for face 012345):
> +    * x < 0.0  : t     t     0     max   t    t
> +    * x >= 1.0 : t     t     0     max   t    t
> +    * y < 0.0  : max-s s     0     max   max  0
> +    * y >= 1.0 : s     max-s 0     max   0    max
> +    *
> +    * ncy[0] = (face & ~4) > 1 ? (face == 2 ? 0 : max) : t
> +    * ncy[1] = ncy[0]
> +    * ncy[3] = face > 1 ? (face & 1 ? max : 0) : (face & 1) ? max-s : s
> +    * ncy[2] = (face & ~4) > 1 ? ncy[3] : max - ncy[3]
> +    */
> +   tmp = lp_build_select(ivec_bld, sel_f2, ivec_bld->zero, max_coord);
> +   next_ycoords[0][0] = lp_build_select(ivec_bld, sel_f23, tmp, y0);
> +   next_ycoords[1][0] = next_ycoords[0][0];
> +   next_ycoords[0][1] = lp_build_select(ivec_bld, sel_f23, tmp, y1);
> +   next_ycoords[1][1] = next_ycoords[0][1];
> +
> +   tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0);
> +   tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero);
> +   next_ycoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
> +   tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][0]);
> +   next_ycoords[2][0] = lp_build_select(ivec_bld, sel_f23,
> next_ycoords[3][0], tmp);
> +   tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1);
> +   tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero);
> +   next_ycoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
> +   tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][1]);
> +   next_ycoords[2][1] = lp_build_select(ivec_bld, sel_f23,
> next_ycoords[3][1], tmp);
> +}
> +
>  
>  /** Helper used by lp_build_cube_lookup() */
>  static LLVMValueRef
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
> b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
> index 70f0350..5039128 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
> @@ -464,6 +464,19 @@ lp_build_cube_lookup(struct lp_build_sample_context
> *bld,
>  
>  
>  void
> +lp_build_cube_new_coords(struct lp_build_context *ivec_bld,
> +                         LLVMValueRef face,
> +                         LLVMValueRef x0,
> +                         LLVMValueRef x1,
> +                         LLVMValueRef y0,
> +                         LLVMValueRef y1,
> +                         LLVMValueRef max_coord,
> +                         LLVMValueRef new_faces[4],
> +                         LLVMValueRef new_xcoords[4][2],
> +                         LLVMValueRef new_ycoords[4][2]);
> +
> +
> +void
>  lp_build_sample_partial_offset(struct lp_build_context *bld,
>                                 unsigned block_length,
>                                 LLVMValueRef coord,
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
> b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
> index 54dee25..8e2d0d9 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
> @@ -848,10 +848,14 @@ lp_build_sample_image_linear(struct
> lp_build_sample_context *bld,
>     LLVMValueRef flt_width_vec;
>     LLVMValueRef flt_height_vec;
>     LLVMValueRef flt_depth_vec;
> -   LLVMValueRef x0, y0 = NULL, z0 = NULL, x1, y1 = NULL, z1 = NULL;
> +   LLVMValueRef z1 = NULL;
> +   LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
> +   LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
> +   LLVMValueRef y00 = NULL, y01 = NULL, y10 = NULL, y11 = NULL;
>     LLVMValueRef s_fpart, t_fpart = NULL, r_fpart = NULL;
> +   LLVMValueRef xs[4], ys[4], zs[4];
>     LLVMValueRef neighbors[2][2][4];
> -   int chan;
> +   int chan, texel_index;
>  
>     lp_build_extract_image_sizes(bld,
>                                  &bld->int_size_bld,
> @@ -870,39 +874,202 @@ lp_build_sample_image_linear(struct
> lp_build_sample_context *bld,
>     /*
>      * Compute integer texcoords.
>      */
> -   lp_build_sample_wrap_linear(bld, coords[0], width_vec,
> -                               flt_width_vec, offsets[0],
> -                               bld->static_texture_state->pot_width,
> -                               bld->static_sampler_state->wrap_s,
> -                               &x0, &x1, &s_fpart);
> -   lp_build_name(x0, "tex.x0.wrapped");
> -   lp_build_name(x1, "tex.x1.wrapped");
>  
> -   if (dims >= 2) {
> -      lp_build_sample_wrap_linear(bld, coords[1], height_vec,
> -                                  flt_height_vec, offsets[1],
> -                                  bld->static_texture_state->pot_height,
> -                                  bld->static_sampler_state->wrap_t,
> -                                  &y0, &y1, &t_fpart);
> -      lp_build_name(y0, "tex.y0.wrapped");
> -      lp_build_name(y1, "tex.y1.wrapped");
> +   if (bld->static_texture_state->target != PIPE_TEXTURE_CUBE ||
> +       !bld->static_sampler_state->seamless_cube_map) {
> +      lp_build_sample_wrap_linear(bld, coords[0], width_vec,
> +                                  flt_width_vec, offsets[0],
> +                                  bld->static_texture_state->pot_width,
> +                                  bld->static_sampler_state->wrap_s,
> +                                  &x00, &x01, &s_fpart);
> +      lp_build_name(x00, "tex.x0.wrapped");
> +      lp_build_name(x01, "tex.x1.wrapped");
> +      x10 = x00;
> +      x11 = x01;
>  
> -      if (dims == 3) {
> -         lp_build_sample_wrap_linear(bld, coords[2], depth_vec,
> -                                     flt_depth_vec, offsets[2],
> -                                     bld->static_texture_state->pot_depth,
> -                                     bld->static_sampler_state->wrap_r,
> -                                     &z0, &z1, &r_fpart);
> -         lp_build_name(z0, "tex.z0.wrapped");
> -         lp_build_name(z1, "tex.z1.wrapped");
> +      if (dims >= 2) {
> +         lp_build_sample_wrap_linear(bld, coords[1], height_vec,
> +                                     flt_height_vec, offsets[1],
> +                                     bld->static_texture_state->pot_height,
> +                                     bld->static_sampler_state->wrap_t,
> +                                     &y00, &y10, &t_fpart);
> +         lp_build_name(y00, "tex.y0.wrapped");
> +         lp_build_name(y10, "tex.y1.wrapped");
> +         y01 = y00;
> +         y11 = y10;
> +
> +         if (dims == 3) {
> +            lp_build_sample_wrap_linear(bld, coords[2], depth_vec,
> +                                        flt_depth_vec, offsets[2],
> +
> bld->static_texture_state->pot_depth,
> +                                        bld->static_sampler_state->wrap_r,
> +                                        &z00, &z1, &r_fpart);
> +            z01 = z10 = z11 = z00;
> +            lp_build_name(z00, "tex.z0.wrapped");
> +            lp_build_name(z1, "tex.z1.wrapped");
> +         }
> +      }
> +      if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
> +          bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
> +          bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
> +         z00 = z01 = z10 = z11 = z1 = coords[2];  /* cube face or layer */
> +         lp_build_name(z00, "tex.z0.layer");
> +         lp_build_name(z1, "tex.z1.layer");
>        }
>     }
> -   if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
> -       bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
> -       bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
> -      z0 = z1 = coords[2];  /* cube face or layer */
> -      lp_build_name(z0, "tex.z0.layer");
> -      lp_build_name(z1, "tex.z1.layer");
> +   else {
> +      LLVMBuilderRef builder = bld->gallivm->builder;
> +      struct lp_build_context *ivec_bld = &bld->int_coord_bld;
> +      struct lp_build_context *coord_bld = &bld->coord_bld;
> +      struct lp_build_if_state edge_if;
> +      LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
> +      LLVMValueRef fall_off[4], coord, have_edge;
> +      LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp;
> +      LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
> +      LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
> +      LLVMValueRef face = coords[2];
> +      LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type,
> 0.5f);
> +      LLVMValueRef length_minus_one = lp_build_sub(ivec_bld, width_vec,
> ivec_bld->one);
> +      /* XXX drop height calcs. Could (should) do this without seamless
> filtering too */
> +      height_vec = width_vec;
> +      flt_height_vec = flt_width_vec;
> +
> +      /* XXX the overflow logic is actually sort of duplicated with
> trilinear,
> +       * since an overflow in one mip should also have a corresponding
> overflow
> +       * in another.
> +       */
> +      /* should always have normalized coords, and offsets are undefined */
> +      assert(bld->static_sampler_state->normalized_coords);
> +      coord = lp_build_mul(coord_bld, coords[0], flt_width_vec);
> +      /* instead of clamp, build mask if overflowed */
> +      coord = lp_build_sub(coord_bld, coord, half);
> +      /* convert to int, compute lerp weight */
> +      /* not ideal with AVX (and no AVX2) */
> +      lp_build_ifloor_fract(coord_bld, coord, &x0, &s_fpart);
> +      x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
> +      coord = lp_build_mul(coord_bld, coords[1], flt_height_vec);
> +      coord = lp_build_sub(coord_bld, coord, half);
> +      lp_build_ifloor_fract(coord_bld, coord, &y0, &t_fpart);
> +      y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
> +
> +      fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0,
> ivec_bld->zero);
> +      fall_off[1] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, x1,
> length_minus_one);
> +      fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0,
> ivec_bld->zero);
> +      fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1,
> length_minus_one);
> +
> +      have_edge = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
> +      have_edge = lp_build_or(ivec_bld, have_edge, fall_off[2]);
> +      have_edge = lp_build_or(ivec_bld, have_edge, fall_off[3]);
> +
> +      have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length,
> have_edge);
> +
> +      for (texel_index = 0; texel_index < 4; texel_index++) {
> +         xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type,
> "xs");
> +         ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type,
> "ys");
> +         zs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type,
> "zs");
> +      }
> +
> +      lp_build_if(&edge_if, bld->gallivm, have_edge);
> +
> +      /*
> +       * Need to feed clamped values here for cheap corner handling,
> +       * but only for y coord (as when falling off both edges we only
> +       * fall off the x one) - this should be sufficient.
> +       */
> +      y0_clamped = lp_build_max(ivec_bld, y0, ivec_bld->zero);
> +      y1_clamped = lp_build_min(ivec_bld, y1, length_minus_one);
> +
> +      /*
> +       * Get all possible new coords.
> +       */
> +      lp_build_cube_new_coords(ivec_bld, face,
> +                               x0, x1, y0_clamped, y1_clamped,
> +                               length_minus_one,
> +                               new_faces, new_xcoords, new_ycoords);
> +
> +      /* handle fall off x-, x+ direction */
> +      /* determine new coords, face (not both fall_off vars can be true at
> same time) */
> +      x00 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][0], x0);
> +      y00 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][0],
> y0_clamped);
> +      x10 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][1], x0);
> +      y10 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][1],
> y1_clamped);
> +      x01 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][0], x1);
> +      y01 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][0],
> y0_clamped);
> +      x11 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][1], x1);
> +      y11 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][1],
> y1_clamped);
> +
> +      z00 = z10 = lp_build_select(ivec_bld, fall_off[0], new_faces[0],
> face);
> +      z01 = z11 = lp_build_select(ivec_bld, fall_off[1], new_faces[1],
> face);
> +
> +      /* handle fall off y-, y+ direction */
> +      /*
> +       * Cheap corner logic: just hack up things so a texel doesn't fall
> +       * off both sides (which means filter weights will be wrong but we'll
> only
> +       * use valid texels in the filter).
> +       * This means however (y) coords must additionally be clamped (see
> above).
> +       * This corner handling should be fully OpenGL (but not d3d10)
> compliant.
> +       */
> +      fall_off_ym_notxm = lp_build_andnot(ivec_bld, fall_off[2],
> fall_off[0]);
> +      fall_off_ym_notxp = lp_build_andnot(ivec_bld, fall_off[2],
> fall_off[1]);
> +      fall_off_yp_notxm = lp_build_andnot(ivec_bld, fall_off[3],
> fall_off[0]);
> +      fall_off_yp_notxp = lp_build_andnot(ivec_bld, fall_off[3],
> fall_off[1]);
> +
> +      x00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_xcoords[2][0],
> x00);
> +      y00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_ycoords[2][0],
> y00);
> +      x01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_xcoords[2][1],
> x01);
> +      y01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_ycoords[2][1],
> y01);
> +      x10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_xcoords[3][0],
> x10);
> +      y10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_ycoords[3][0],
> y10);
> +      x11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_xcoords[3][1],
> x11);
> +      y11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_ycoords[3][1],
> y11);
> +
> +      z00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_faces[2], z00);
> +      z01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_faces[2], z01);
> +      z10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_faces[3], z10);
> +      z11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_faces[3], z11);
> +
> +      LLVMBuildStore(builder, x00, xs[0]);
> +      LLVMBuildStore(builder, x01, xs[1]);
> +      LLVMBuildStore(builder, x10, xs[2]);
> +      LLVMBuildStore(builder, x11, xs[3]);
> +      LLVMBuildStore(builder, y00, ys[0]);
> +      LLVMBuildStore(builder, y01, ys[1]);
> +      LLVMBuildStore(builder, y10, ys[2]);
> +      LLVMBuildStore(builder, y11, ys[3]);
> +      LLVMBuildStore(builder, z00, zs[0]);
> +      LLVMBuildStore(builder, z01, zs[1]);
> +      LLVMBuildStore(builder, z10, zs[2]);
> +      LLVMBuildStore(builder, z11, zs[3]);
> +
> +      lp_build_else(&edge_if);
> +
> +      LLVMBuildStore(builder, x0, xs[0]);
> +      LLVMBuildStore(builder, x1, xs[1]);
> +      LLVMBuildStore(builder, x0, xs[2]);
> +      LLVMBuildStore(builder, x1, xs[3]);
> +      LLVMBuildStore(builder, y0, ys[0]);
> +      LLVMBuildStore(builder, y0, ys[1]);
> +      LLVMBuildStore(builder, y1, ys[2]);
> +      LLVMBuildStore(builder, y1, ys[3]);
> +      LLVMBuildStore(builder, face, zs[0]);
> +      LLVMBuildStore(builder, face, zs[1]);
> +      LLVMBuildStore(builder, face, zs[2]);
> +      LLVMBuildStore(builder, face, zs[3]);
> +
> +      lp_build_endif(&edge_if);
> +
> +      x00 = LLVMBuildLoad(builder, xs[0], "");
> +      x01 = LLVMBuildLoad(builder, xs[1], "");
> +      x10 = LLVMBuildLoad(builder, xs[2], "");
> +      x11 = LLVMBuildLoad(builder, xs[3], "");
> +      y00 = LLVMBuildLoad(builder, ys[0], "");
> +      y01 = LLVMBuildLoad(builder, ys[1], "");
> +      y10 = LLVMBuildLoad(builder, ys[2], "");
> +      y11 = LLVMBuildLoad(builder, ys[3], "");
> +      z00 = LLVMBuildLoad(builder, zs[0], "");
> +      z01 = LLVMBuildLoad(builder, zs[1], "");
> +      z10 = LLVMBuildLoad(builder, zs[2], "");
> +      z11 = LLVMBuildLoad(builder, zs[3], "");
>     }
>  
>     if (linear_mask) {
> @@ -937,12 +1104,12 @@ lp_build_sample_image_linear(struct
> lp_build_sample_context *bld,
>     /* get x0/x1 texels */
>     lp_build_sample_texel_soa(bld,
>                               width_vec, height_vec, depth_vec,
> -                             x0, y0, z0,
> +                             x00, y00, z00,
>                               row_stride_vec, img_stride_vec,
>                               data_ptr, mipoffsets, neighbors[0][0]);
>     lp_build_sample_texel_soa(bld,
>                               width_vec, height_vec, depth_vec,
> -                             x1, y0, z0,
> +                             x01, y01, z01,
>                               row_stride_vec, img_stride_vec,
>                               data_ptr, mipoffsets, neighbors[0][1]);
>  
> @@ -973,12 +1140,12 @@ lp_build_sample_image_linear(struct
> lp_build_sample_context *bld,
>        /* get x0/x1 texels at y1 */
>        lp_build_sample_texel_soa(bld,
>                                  width_vec, height_vec, depth_vec,
> -                                x0, y1, z0,
> +                                x10, y10, z10,
>                                  row_stride_vec, img_stride_vec,
>                                  data_ptr, mipoffsets, neighbors[1][0]);
>        lp_build_sample_texel_soa(bld,
>                                  width_vec, height_vec, depth_vec,
> -                                x1, y1, z0,
> +                                x11, y11, z11,
>                                  row_stride_vec, img_stride_vec,
>                                  data_ptr, mipoffsets, neighbors[1][1]);
>  
> @@ -1012,22 +1179,22 @@ lp_build_sample_image_linear(struct
> lp_build_sample_context *bld,
>           /* get x0/x1/y0/y1 texels at z1 */
>           lp_build_sample_texel_soa(bld,
>                                     width_vec, height_vec, depth_vec,
> -                                   x0, y0, z1,
> +                                   x00, y00, z1,
>                                     row_stride_vec, img_stride_vec,
>                                     data_ptr, mipoffsets, neighbors1[0][0]);
>           lp_build_sample_texel_soa(bld,
>                                     width_vec, height_vec, depth_vec,
> -                                   x1, y0, z1,
> +                                   x01, y01, z1,
>                                     row_stride_vec, img_stride_vec,
>                                     data_ptr, mipoffsets, neighbors1[0][1]);
>           lp_build_sample_texel_soa(bld,
>                                     width_vec, height_vec, depth_vec,
> -                                   x0, y1, z1,
> +                                   x10, y10, z1,
>                                     row_stride_vec, img_stride_vec,
>                                     data_ptr, mipoffsets, neighbors1[1][0]);
>           lp_build_sample_texel_soa(bld,
>                                     width_vec, height_vec, depth_vec,
> -                                   x1, y1, z1,
> +                                   x11, y11, z1,
>                                     row_stride_vec, img_stride_vec,
>                                     data_ptr, mipoffsets, neighbors1[1][1]);
>  
> @@ -2306,15 +2473,25 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
>              use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_r);
>           }
>        }
> +      if (static_texture_state->target == PIPE_TEXTURE_CUBE &&
> +          derived_sampler_state.seamless_cube_map &&
> +          (derived_sampler_state.min_img_filter == PIPE_TEX_FILTER_LINEAR ||
> +           derived_sampler_state.mag_img_filter == PIPE_TEX_FILTER_LINEAR))
> {
> +         /* theoretically possible with AoS filtering but not implemented
> (complex!) */
> +         use_aos = 0;
> +      }
>  
>        if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
>            !use_aos && util_format_fits_8unorm(bld.format_desc)) {
>           debug_printf("%s: using floating point linear filtering for %s\n",
>                        __FUNCTION__, bld.format_desc->short_name);
> -         debug_printf("  min_img %d  mag_img %d  mip %d  wraps %d  wrapt %d
> wrapr %d\n",
> +         debug_printf("  min_img %d  mag_img %d  mip %d  target %d  seamless
> %d"
> +                      "  wraps %d  wrapt %d  wrapr %d\n",
>                        derived_sampler_state.min_img_filter,
>                        derived_sampler_state.mag_img_filter,
>                        derived_sampler_state.min_mip_filter,
> +                      static_texture_state->target,
> +                      derived_sampler_state.seamless_cube_map,
>                        derived_sampler_state.wrap_s,
>                        derived_sampler_state.wrap_t,
>                        derived_sampler_state.wrap_r);
> --
> 1.7.9.5
>