[Mesa-dev] [PATCH 3/3] i965/miptree: Use cpu tiling/detiling when mapping

Tue Oct 23 04:52:53 UTC 2018

On 10/22/18 11:47 PM, Matt Turner wrote:
> On Mon, Sep 24, 2018 at 4:20 AM Tapani Pälli <tapani.palli at intel.com> wrote:
>>
>> From: Scott D Phillips <scott.d.phillips at intel.com>
>>
>> Rename the (un)map_gtt functions to (un)map_map (map by
>> returning a map) and add new functions (un)map_tiled_memcpy that
>> return a shadow buffer populated with the intel_tiled_memcpy
>> functions.
>>
>> Tiling/detiling with the cpu will be the only way to handle Yf/Ys
>> tiling, when support is added for those formats.
>>
>> v2: Compute extents properly in the x|y-rounded-down case (Chris Wilson)
>>
>> v3: Add units to parameter names of tile_extents (Nanley Chery)
>>      Use _mesa_align_malloc for the shadow copy (Nanley)
>>      Continue using gtt maps on gen4 (Nanley)
>>
>> v4: Use streaming_load_memcpy when detiling
>>
>> v5: (edited by Ken) Move map_tiled_memcpy above map_movntdqa, so it
>>      takes precedence.  Add intel_miptree_access_raw, needed after
>>      rebasing on commit b499b85b0f2cc0c82b7c9af91502c2814fdc8e67.
>>
>> v6: refactor to changes done for sse41 separation (Tapani)
>>
>> Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk> (v5)
>> Reviewed-by: Kenneth Graunke <kenneth at whitecape.org> (v5)
>>
>> Signed-off-by: Tapani Pälli <tapani.palli at intel.com>
>> ---
>>   src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 110 +++++++++++++++++-
>>   1 file changed, 106 insertions(+), 4 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
>> index 36681352ba7..4c2cee8ebba 100644
>> --- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
>> +++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
>> @@ -31,6 +31,8 @@
>>   #include "intel_image.h"
>>   #include "intel_mipmap_tree.h"
>>   #include "intel_tex.h"
>> +#include "intel_tiled_memcpy.h"
>> +#include "intel_tiled_memcpy_sse41.h"
>>   #include "intel_blit.h"
>>   #include "intel_fbo.h"
>>
>> @@ -2998,7 +3000,7 @@ intel_miptree_unmap_raw(struct intel_mipmap_tree *mt)
>>   }
>>
>>   static void
>> -intel_miptree_unmap_gtt(struct brw_context *brw,
>> +intel_miptree_unmap_map(struct brw_context *brw,
>>                           struct intel_mipmap_tree *mt,
>>                           struct intel_miptree_map *map,
>>                           unsigned int level, unsigned int slice)
>> @@ -3007,7 +3009,7 @@ intel_miptree_unmap_gtt(struct brw_context *brw,
>>   }
>>
>>   static void
>> -intel_miptree_map_gtt(struct brw_context *brw,
>> +intel_miptree_map_map(struct brw_context *brw,
>>                        struct intel_mipmap_tree *mt,
>>                        struct intel_miptree_map *map,
>>                        unsigned int level, unsigned int slice)
>> @@ -3055,7 +3057,7 @@ intel_miptree_map_gtt(struct brw_context *brw,
>>          mt, _mesa_get_format_name(mt->format),
>>          x, y, map->ptr, map->stride);
>>
>> -   map->unmap = intel_miptree_unmap_gtt;
>> +   map->unmap = intel_miptree_unmap_map;
>>   }
>>
>>   static void
>> @@ -3087,6 +3089,101 @@ intel_miptree_unmap_blit(struct brw_context *brw,
>>      intel_miptree_release(&map->linear_mt);
>>   }
>>
>> +/* Compute extent parameters for use with tiled_memcpy functions.
>> + * xs are in units of bytes and ys are in units of strides.
>> + */
>> +static inline void
>> +tile_extents(struct intel_mipmap_tree *mt, struct intel_miptree_map *map,
>> +             unsigned int level, unsigned int slice, unsigned int *x1_B,
>> +             unsigned int *x2_B, unsigned int *y1_el, unsigned int *y2_el)
>> +{
>> +   unsigned int block_width, block_height;
>> +   unsigned int x0_el, y0_el;
>> +
>> +   _mesa_get_format_block_size(mt->format, &block_width, &block_height);
>> +
>> +   assert(map->x % block_width == 0);
>> +   assert(map->y % block_height == 0);
>> +
>> +   intel_miptree_get_image_offset(mt, level, slice, &x0_el, &y0_el);
>> +   *x1_B = (map->x / block_width + x0_el) * mt->cpp;
>> +   *y1_el = map->y / block_height + y0_el;
>> +   *x2_B = (DIV_ROUND_UP(map->x + map->w, block_width) + x0_el) * mt->cpp;
>> +   *y2_el = DIV_ROUND_UP(map->y + map->h, block_height) + y0_el;
>> +}
>> +
>> +static void
>> +intel_miptree_unmap_tiled_memcpy(struct brw_context *brw,
>> +                                 struct intel_mipmap_tree *mt,
>> +                                 struct intel_miptree_map *map,
>> +                                 unsigned int level,
>> +                                 unsigned int slice)
>> +{
>> +   if (map->mode & GL_MAP_WRITE_BIT) {
>> +      unsigned int x1, x2, y1, y2;
>> +      tile_extents(mt, map, level, slice, &x1, &x2, &y1, &y2);
>> +
>> +      char *dst = intel_miptree_map_raw(brw, mt, map->mode | MAP_RAW);
>> +      dst += mt->offset;
>> +
>> +      linear_to_tiled(x1, x2, y1, y2, dst, map->ptr, mt->surf.row_pitch,
>> +                      map->stride, brw->has_swizzling, mt->surf.tiling,
>> +                      INTEL_COPY_MEMCPY);
>> +
>> +      intel_miptree_unmap_raw(mt);
>> +   }
>> +   _mesa_align_free(map->buffer);
>> +   map->buffer = map->ptr = NULL;
>> +}
>> +
>> +static void
>> +intel_miptree_map_tiled_memcpy(struct brw_context *brw,
>> +                               struct intel_mipmap_tree *mt,
>> +                               struct intel_miptree_map *map,
>> +                               unsigned int level, unsigned int slice)
>> +{
>> +   intel_miptree_access_raw(brw, mt, level, slice,
>> +                            map->mode & GL_MAP_WRITE_BIT);
>> +
>> +   unsigned int x1, x2, y1, y2;
>> +   tile_extents(mt, map, level, slice, &x1, &x2, &y1, &y2);
>> +   map->stride = ALIGN(_mesa_format_row_stride(mt->format, map->w), 16);
>> +
>> +   /* The tiling and detiling functions require that the linear buffer
>> +    * has proper 16-byte alignment (that is, its `x0` is 16-byte
>> +    * aligned). Here we over-allocate the linear buffer by enough
>> +    * bytes to get the proper alignment.
>> +    */
>> +   map->buffer = _mesa_align_malloc(map->stride * (y2 - y1) + (x1 & 0xf), 16);
>> +   map->ptr = (char *)map->buffer + (x1 & 0xf);
>> +   assert(map->buffer);
>> +
>> +   if (!(map->mode & GL_MAP_INVALIDATE_RANGE_BIT)) {
>> +      char *src = intel_miptree_map_raw(brw, mt, map->mode | MAP_RAW);
>> +      src += mt->offset;
>> +
>> +      const tiled_to_linear_fn ttl_func =
>> +#if defined(USE_SSE41)
>> +         cpu_has_sse4_1 ? tiled_to_linear_sse41 :
>> +#endif
>> +         tiled_to_linear;
>> +
>> +      const mem_copy_fn_type copy_type =
>> +#if defined(USE_SSE41)
>> +         cpu_has_sse4_1 ? INTEL_COPY_STREAMING_LOAD :
>> +#endif
>> +         INTEL_COPY_MEMCPY;
> 
> I find this bit weird -- identical blocks of code that pick the SSE4
> vs non-SSE4 function and also INTEL_COPY_STREAMING_LOAD vs
> INTEL_COPY_MEMCPY based on the same condition.
> 
> Reviewing patches 1 and 2 I expected this mem_copy_fn_type to be used
> to select the variation of the function to call. That would be nice to
> do, but that's fine as a clean up.

Yeah, this would make sense. I'll take a look at that later. There was
also some discussion of moving this functionality to isl, so I could do
that cleanup as part of the move.

> All three are
> 
> Reviewed-by: Matt Turner <mattst88 at gmail.com>

Thanks!

>> +
>> +      ttl_func(x1, x2, y1, y2, map->ptr, src, map->stride,
>> +               mt->surf.row_pitch, brw->has_swizzling, mt->surf.tiling,
>> +               copy_type);