[Mesa-dev] [PATCH 3/3] gallivm: optimize repeat linear npot code in the aos int path

Tue Feb 18 20:06:13 PST 2014

I tested these patches out and they seem to work well enough for my test cases.

-Jeff

On Feb 14, 2014, at 8:06 PM, Roland Scheidegger <sroland at vmware.com> wrote:

> FWIW I've just cleaned 1/3 and 3/3 up a little and splitted it off into
> two patches (I really want to be able to track any changes this might
> cause separately), and on x86 sse I actually managed to shave off one
> instruction by using lp_build_iround() too :-).
> 2/3 is more of the same just for the nearest filtering path.
> In any case though I haven't actually tested any of it yet but the issue
> indeed looks very real to me. I actually need to really run some
> internal tests with this (piglit is usually not close to sensitive
> enough), the whole texture wrap mode stuff is a bit of a nightmare (as
> entirely different paths will be run depending on cpu flags AND texture
> format which makes bugs in there difficult to detect). At some point I
> wanted to unify the coord wrapping in the aos and soa paths since this
> doesn't really depend on if aos or soa filtering is used though there
> are indeed some dependencies if you want to get optimal code.
> 
> Roland
> 
> 
> Am 15.02.2014 01:54, schrieb sroland at vmware.com:
>> From: Jeff Muizelaar <jmuizelaar at mozilla.com>
>> 
>> Similar to the other cases, shift some weight/coord calculations to int
>> space. This should be slightly faster (on x86 sse it should actually safe one
>> instruction, and generally int instructions are cheaper).
>> ---
>> src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c |   74 +++++++++++++++++----
>> 1 file changed, 62 insertions(+), 12 deletions(-)
>> 
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
>> index 03a2ed5..e9f8611 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
>> @@ -194,6 +194,62 @@ lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld,
>> 
>> 
>> /**
>> + * Helper to compute the first coord and the weight for
>> + * linear wrap repeat npot textures
>> + */
>> +static void
>> +lp_build_coord_repeat_npot_linear_int(struct lp_build_sample_context *bld,
>> +                                      LLVMValueRef coord_f,
>> +                                      LLVMValueRef length_i,
>> +                                      LLVMValueRef length_f,
>> +                                      LLVMValueRef *coord0_i,
>> +                                      LLVMValueRef *weight_i)
>> +{
>> +   struct lp_build_context *coord_bld = &bld->coord_bld;
>> +   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
>> +   struct lp_build_context abs_coord_bld;
>> +   struct lp_type abs_type;
>> +   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
>> +                                                int_coord_bld->one);
>> +   LLVMValueRef mask, i32_c8, i32_c128, i32_c255;
>> +
>> +   /* wrap with normalized floats is just fract */
>> +   coord_f = lp_build_fract(coord_bld, coord_f);
>> +   /* mul by size */
>> +   coord_f = lp_build_mul(coord_bld, coord_f, length_f);
>> +   /* convert to int, compute lerp weight */
>> +   coord_f = lp_build_mul_imm(&bld->coord_bld, coord_f, 256);
>> +
>> +   /* At this point we don't have any negative numbers so use non-signed
>> +    * build context which might help on some archs.
>> +    */
>> +   abs_type = coord_bld->type;
>> +   abs_type.sign = 0;
>> +   lp_build_context_init(&abs_coord_bld, bld->gallivm, abs_type);
>> +   *coord0_i = lp_build_iround(&abs_coord_bld, coord_f);
>> +
>> +   /* subtract 0.5 (add -128) */
>> +   i32_c128 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, -128);
>> +   *coord0_i = LLVMBuildAdd(bld->gallivm->builder, *coord0_i, i32_c128, "");
>> +
>> +   /* compute fractional part (AND with 0xff) */
>> +   i32_c255 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 255);
>> +   *weight_i = LLVMBuildAnd(bld->gallivm->builder, *coord0_i, i32_c255, "");
>> +
>> +   /* compute floor (shift right 8) */
>> +   i32_c8 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 8);
>> +   *coord0_i = LLVMBuildAShr(bld->gallivm->builder, *coord0_i, i32_c8, "");
>> +   /*
>> +    * we avoided the 0.5/length division before the repeat wrap,
>> +    * now need to fix up edge cases with selects
>> +    */
>> +   mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
>> +                           PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero);
>> +   *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
>> +}
>> +
>> +
>> +/**
>>  * Build LLVM code for texture coord wrapping, for linear filtering,
>>  * for scaled integer texcoords.
>>  * \param block_length  is the length of the pixel block along the
>> @@ -251,24 +307,21 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
>>          }
>>          else {
>>             LLVMValueRef mask;
>> -            LLVMValueRef weight;
>>             LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
>>             if (offset) {
>>                offset = lp_build_int_to_float(&bld->coord_bld, offset);
>>                offset = lp_build_div(&bld->coord_bld, offset, length_f);
>>                coord_f = lp_build_add(&bld->coord_bld, coord_f, offset);
>>             }
>> -            lp_build_coord_repeat_npot_linear(bld, coord_f,
>> -                                              length, length_f,
>> -                                              &coord0, &weight);
>> +            lp_build_coord_repeat_npot_linear_int(bld, coord_f,
>> +                                                  length, length_f,
>> +                                                  &coord0, weight_i);
>>             mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
>>                                     PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
>>             coord1 = LLVMBuildAnd(builder,
>>                                   lp_build_add(int_coord_bld, coord0,
>>                                                int_coord_bld->one),
>>                                   mask, "");
>> -            weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
>> -            *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
>>          }
>>          break;
>> 
>> @@ -308,18 +361,15 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
>>          coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
>>       }
>>       else {
>> -         LLVMValueRef weight;
>>          LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
>>          if (offset) {
>>             offset = lp_build_int_to_float(&bld->coord_bld, offset);
>>             offset = lp_build_div(&bld->coord_bld, offset, length_f);
>>             coord_f = lp_build_add(&bld->coord_bld, coord_f, offset);
>>          }
>> -         lp_build_coord_repeat_npot_linear(bld, coord_f,
>> -                                           length, length_f,
>> -                                           &coord0, &weight);
>> -         weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
>> -         *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
>> +         lp_build_coord_repeat_npot_linear_int(bld, coord_f,
>> +                                               length, length_f,
>> +                                               &coord0, weight_i);
>>       }
>> 
>>       mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
>>