[Mesa-dev] [PATCH 0.5/5] i965/tiled_memcpy: linear_to_ytiled a cache line at a time

Thu Jan 25 01:51:32 UTC 2018

Jason Ekstrand <jason at jlekstrand.net> writes:

> I'm a bit concerned about how complex things are getting.  I've been
> looking at this for the past hour or so and I don't have anything yet.
> I'll keep thinking.

Hmm, ya, the patch is a bit repetitive. Right now the loop iteration
over the tiled address bits looks like:

v4 v3 v2 u6 u5 u4 v1 v0 u3 u2 u1 u0
======== ~~~~~~~~ ===== ~~~~~~~~~~~
        inner loop ^    16-byte memcpy
outer loop         |
                4-repetition

Maybe restructuring it to follow the y-tile address swizzling more
closely will let me drop the 4-repetition, like:

u6 u5 u4 v4 v3 v2 v1 v0 u3 u2 u1 u0
~~~~~~~~ ==inner loop== ~~memcpy~~~
outer
 loop

Unless something else comes to you, I can give that a shot.

> On Tue, Jan 23, 2018 at 6:42 AM, Scott D Phillips <
> scott.d.phillips at intel.com> wrote:
>
>> TileY's low 6 address bits are: v1 v0 u3 u2 u1 u0
>> Thus a cache line in the tiled surface is composed of a 2d area of
>> 16x4 bytes of the linear surface.
>>
>> Add a special case where the area being copied is 4-line aligned
>> and a multiple of 4-lines so that entire cache lines will be
>> written at a time.
>>
>> On Apollolake, this increases tiling throughput to wc maps by
>> 83.1412% +/- 1.81446%
>> ---
>>  src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 63
>> ++++++++++++++++++++++----
>>  1 file changed, 55 insertions(+), 8 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
>> b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
>> index e2b7b3496d..e45f3fec1e 100644
>> --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
>> +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
>> @@ -287,8 +287,8 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t
>> x2, uint32_t x3,
>>   */
>>  static inline void
>>  linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
>> -                 uint32_t y0, uint32_t y1,
>> -                 char *dst, const char *src,
>> +                 uint32_t y0, uint32_t y3,
>> +                 char *dst, const char *src0,
>>                   int32_t src_pitch,
>>                   uint32_t swizzle_bit,
>>                   mem_copy_fn mem_copy,
>> @@ -306,6 +306,9 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t
>> x2, uint32_t x3,
>>     const uint32_t column_width = ytile_span;
>>     const uint32_t bytes_per_column = column_width * ytile_height;
>>
>> +   uint32_t y1 = ALIGN_UP(y0, 4);
>> +   uint32_t y2 = ALIGN_DOWN(y3, 4);
>> +
>>     uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) *
>> bytes_per_column;
>>     uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) *
>> bytes_per_column;
>>
>> @@ -319,26 +322,70 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t
>> x2, uint32_t x3,
>>
>>     uint32_t x, yo;
>>
>> -   src += (ptrdiff_t)y0 * src_pitch;
>> +   const char *src = src0 + (ptrdiff_t)y1 * src_pitch;
>>
>> -   for (yo = y0 * column_width; yo < y1 * column_width; yo +=
>> column_width) {
>> +   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 *
>> column_width) {
>>        uint32_t xo = xo1;
>>        uint32_t swizzle = swizzle1;
>>
>> -      mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
>> +      if (x0 != x1) {
>> +         mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src +
>> x0 + 0 * src_pitch, x1 - x0);
>> +         mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src +
>> x0 + 1 * src_pitch, x1 - x0);
>> +         mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src +
>> x0 + 2 * src_pitch, x1 - x0);
>> +         mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src +
>> x0 + 3 * src_pitch, x1 - x0);
>> +      }
>>
>>        /* Step by spans/columns.  As it happens, the swizzle bit flips
>>         * at each step so we don't need to calculate it explicitly.
>>         */
>>        for (x = x1; x < x2; x += ytile_span) {
>> -         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x,
>> ytile_span);
>> +         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle),
>> src + x + 0 * src_pitch, ytile_span);
>> +         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle),
>> src + x + 1 * src_pitch, ytile_span);
>> +         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle),
>> src + x + 2 * src_pitch, ytile_span);
>> +         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle),
>> src + x + 3 * src_pitch, ytile_span);
>>           xo += bytes_per_column;
>>           swizzle ^= swizzle_bit;
>>        }
>>
>> -      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
>> +      if (x2 != x3) {
>> +         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle),
>> src + x2 + 0 * src_pitch, x3 - x2);
>> +         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle),
>> src + x2 + 1 * src_pitch, x3 - x2);
>> +         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle),
>> src + x2 + 2 * src_pitch, x3 - x2);
>> +         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle),
>> src + x2 + 3 * src_pitch, x3 - x2);
>> +      }
>>
>> -      src += src_pitch;
>> +      src += 4 * src_pitch;
>> +   }
>> +
>> +   if (y0 != y1 || y2 != y3) {
>> +      src = src0 + (ptrdiff_t)y0 * src_pitch;
>> +
>> +      for (yo = y0 * column_width; yo < y3 * column_width; yo +=
>> column_width) {
>> +         uint32_t xo = xo1;
>> +         uint32_t swizzle = swizzle1;
>> +
>> +         if (yo >= y1 * column_width && yo < y2 * column_width) {
>> +            if (y2 == y3)
>> +               break;
>> +            yo = y2 * column_width;
>> +            src = src0 + y2 * src_pitch;
>> +         }
>> +
>> +         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
>> +
>> +         /* Step by spans/columns.  As it happens, the swizzle bit flips
>> +          * at each step so we don't need to calculate it explicitly.
>> +          */
>> +         for (x = x1; x < x2; x += ytile_span) {
>> +            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x,
>> ytile_span);
>> +            xo += bytes_per_column;
>> +            swizzle ^= swizzle_bit;
>> +         }
>> +
>> +         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
>> +
>> +         src += src_pitch;
>> +      }
>>     }
>>  }
>>
>> --
>> 2.16.1
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev