[Mesa-dev] [PATCH] llvmpipe: reduce alignment requirement for resources from 64x64 to 4x4

Roland Scheidegger sroland at vmware.com
Tue May 28 18:42:28 PDT 2013


FWIW this change is built on top of "llvmpipe: Remove x/y from cmd_bin",
which was discussed on the list but hasn't been committed yet.

Roland

Am 29.05.2013 03:41, schrieb sroland at vmware.com:
> From: Roland Scheidegger <sroland at vmware.com>
> 
> The overallocation was very bad especially for things like 1d array
> textures which got blown up by a factor of 64. (Even ordinary smallish
> 2d textures benefit a lot from this, a mipmapped 64x64 rgba8 texture
> previously used 7*16kB = 112kB instead of now ~22kB.)
> 4x4 is chosen because this is the size the jit functions run on, so
> making it smaller is going to be a bit more complicated.
> It is actually not strictly 4x4 pixels, since we want to avoid situations
> where different threads are rendering to the same cacheline, so we keep
> cacheline size alignment in the x direction (often 64 bytes).
> To make this work, introduce new task width/height parameters and make
> sure clears don't clear the whole tile if it's a partial tile. Likewise,
> the rasterizer may produce fragments outside the 4x4 blocks present in a
> tile, so don't call the jit function for them.
> This does not yet fix rendering to buffers (which cannot have any y
> alignment at all), and 1d/1d array textures are still overallocated by a
> factor of 4.
> ---
>  src/gallium/drivers/llvmpipe/lp_rast.c      |   56 ++++++++++++++++-----------
>  src/gallium/drivers/llvmpipe/lp_rast_priv.h |   37 +++++++++++-------
>  src/gallium/drivers/llvmpipe/lp_scene.c     |    2 +
>  src/gallium/drivers/llvmpipe/lp_scene.h     |    4 ++
>  src/gallium/drivers/llvmpipe/lp_setup.c     |    3 +-
>  src/gallium/drivers/llvmpipe/lp_texture.c   |   26 ++++++-------
>  6 files changed, 75 insertions(+), 53 deletions(-)
> 
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
> index 5c837a0..be5a286 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast.c
> +++ b/src/gallium/drivers/llvmpipe/lp_rast.c
> @@ -95,6 +95,10 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task,
>     task->bin = bin;
>     task->x = x * TILE_SIZE;
>     task->y = y * TILE_SIZE;
> +   task->width = TILE_SIZE + x * TILE_SIZE > task->scene->width_aligned ?
> +                    task->scene->width_aligned - x * TILE_SIZE : TILE_SIZE;
> +   task->height = TILE_SIZE + y * TILE_SIZE > task->scene->height_aligned ?
> +                    task->scene->height_aligned - y * TILE_SIZE : TILE_SIZE;
>  
>     /* reset pointers to color and depth tile(s) */
>     memset(task->color_tiles, 0, sizeof(task->color_tiles));
> @@ -144,8 +148,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,
>                             scene->cbufs[i].stride,
>                             task->x,
>                             task->y,
> -                           TILE_SIZE,
> -                           TILE_SIZE,
> +                           task->width,
> +                           task->height,
>                             &uc);
>           }
>        }
> @@ -172,8 +176,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,
>                             scene->cbufs[i].stride,
>                             task->x,
>                             task->y,
> -                           TILE_SIZE,
> -                           TILE_SIZE,
> +                           task->width,
> +                           task->height,
>                             &uc);
>           }
>        }
> @@ -198,8 +202,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
>     uint64_t clear_mask64 = arg.clear_zstencil.mask;
>     uint32_t clear_value = (uint32_t) clear_value64;
>     uint32_t clear_mask = (uint32_t) clear_mask64;
> -   const unsigned height = TILE_SIZE;
> -   const unsigned width = TILE_SIZE;
> +   const unsigned height = task->height;
> +   const unsigned width = task->width;
>     const unsigned block_size = scene->zsbuf.blocksize;
>     const unsigned dst_stride = scene->zsbuf.stride;
>     uint8_t *dst;
> @@ -325,8 +329,8 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
>     variant = state->variant;
>  
>     /* render the whole 64x64 tile in 4x4 chunks */
> -   for (y = 0; y < TILE_SIZE; y += 4){
> -      for (x = 0; x < TILE_SIZE; x += 4) {
> +   for (y = 0; y < task->height; y += 4){
> +      for (x = 0; x < task->width; x += 4) {
>           uint8_t *color[PIPE_MAX_COLOR_BUFS];
>           unsigned stride[PIPE_MAX_COLOR_BUFS];
>           uint8_t *depth = NULL;
> @@ -434,21 +438,27 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
>  
>     assert(lp_check_alignment(state->jit_context.u8_blend_color, 16));
>  
> -   /* run shader on 4x4 block */
> -   BEGIN_JIT_CALL(state, task);
> -   variant->jit_function[RAST_EDGE_TEST](&state->jit_context,
> -                                         x, y,
> -                                         inputs->frontfacing,
> -                                         GET_A0(inputs),
> -                                         GET_DADX(inputs),
> -                                         GET_DADY(inputs),
> -                                         color,
> -                                         depth,
> -                                         mask,
> -                                         &task->thread_data,
> -                                         stride,
> -                                         depth_stride);
> -   END_JIT_CALL();
> +   /*
> +    * The rasterizer may produce fragments outside our
> +    * allocated 4x4 blocks hence need to filter them out here.
> +    */
> +   if ((x % TILE_SIZE) < task->width && (y % TILE_SIZE) < task->height) {
> +      /* run shader on 4x4 block */
> +      BEGIN_JIT_CALL(state, task);
> +      variant->jit_function[RAST_EDGE_TEST](&state->jit_context,
> +                                            x, y,
> +                                            inputs->frontfacing,
> +                                            GET_A0(inputs),
> +                                            GET_DADX(inputs),
> +                                            GET_DADY(inputs),
> +                                            color,
> +                                            depth,
> +                                            mask,
> +                                            &task->thread_data,
> +                                            stride,
> +                                            depth_stride);
> +      END_JIT_CALL();
> +   }
>  }
>  
>  
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
> index e4b6e5b..4876d74 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
> +++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
> @@ -86,6 +86,7 @@ struct lp_rasterizer_task
>  
>     struct lp_scene *scene;
>     unsigned x, y;          /**< Pos of this tile in framebuffer, in pixels */
> +   unsigned width, height; /**< width, height of current tile, in pixels */
>  
>     uint8_t *color_tiles[PIPE_MAX_COLOR_BUFS];
>     uint8_t *depth_tile;
> @@ -293,21 +294,27 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
>        depth_stride = scene->zsbuf.stride;
>     }
>  
> -   /* run shader on 4x4 block */
> -   BEGIN_JIT_CALL(state, task);
> -   variant->jit_function[RAST_WHOLE]( &state->jit_context,
> -                                      x, y,
> -                                      inputs->frontfacing,
> -                                      GET_A0(inputs),
> -                                      GET_DADX(inputs),
> -                                      GET_DADY(inputs),
> -                                      color,
> -                                      depth,
> -                                      0xffff,
> -                                      &task->thread_data,
> -                                      stride,
> -                                      depth_stride);
> -   END_JIT_CALL();
> +   /*
> +    * The rasterizer may produce fragments outside our
> +    * allocated 4x4 blocks hence need to filter them out here.
> +    */
> +   if ((x % TILE_SIZE) < task->width && (y % TILE_SIZE) < task->height) {
> +      /* run shader on 4x4 block */
> +      BEGIN_JIT_CALL(state, task);
> +      variant->jit_function[RAST_WHOLE]( &state->jit_context,
> +                                         x, y,
> +                                         inputs->frontfacing,
> +                                         GET_A0(inputs),
> +                                         GET_DADX(inputs),
> +                                         GET_DADY(inputs),
> +                                         color,
> +                                         depth,
> +                                         0xffff,
> +                                         &task->thread_data,
> +                                         stride,
> +                                         depth_stride);
> +      END_JIT_CALL();
> +   }
>  }
>  
>  void lp_rast_triangle_1( struct lp_rasterizer_task *, 
> diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c
> index 771ad08..1d0dbdf 100644
> --- a/src/gallium/drivers/llvmpipe/lp_scene.c
> +++ b/src/gallium/drivers/llvmpipe/lp_scene.c
> @@ -505,6 +505,8 @@ void lp_scene_begin_binning( struct lp_scene *scene,
>  
>     scene->tiles_x = align(fb->width, TILE_SIZE) / TILE_SIZE;
>     scene->tiles_y = align(fb->height, TILE_SIZE) / TILE_SIZE;
> +   scene->width_aligned = align(fb->width, 4);
> +   scene->height_aligned = align(fb->height, 4);
>  
>     assert(scene->tiles_x <= TILES_X);
>     assert(scene->tiles_y <= TILES_Y);
> diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
> index fa5bbca..bc6c448 100644
> --- a/src/gallium/drivers/llvmpipe/lp_scene.h
> +++ b/src/gallium/drivers/llvmpipe/lp_scene.h
> @@ -144,6 +144,10 @@ struct lp_scene {
>     /** list of resources referenced by the scene commands */
>     struct resource_ref *resources;
>  
> +   /** aligned scene width, height */
> +   unsigned width_aligned;
> +   unsigned height_aligned;
> +
>     /** Total memory used by the scene (in bytes).  This sums all the
>      * data blocks and counts all bins, state, resource references and
>      * other random allocations within the scene.
> diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
> index eb39b1c..8f2baf0 100644
> --- a/src/gallium/drivers/llvmpipe/lp_setup.c
> +++ b/src/gallium/drivers/llvmpipe/lp_setup.c
> @@ -691,8 +691,7 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
>                 assert(last_level <= res->last_level);
>  
>                 /*
> -                * The complexity here is only necessary for depth textures which
> -                * still are tiled.
> +                * The complexity here should no longer be necessary.
>                  */
>                 mip_ptr = llvmpipe_get_texture_image_all(lp_tex, first_level,
>                                                          LP_TEX_USAGE_READ);
> diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
> index 0ac3528..8e9ebdf 100644
> --- a/src/gallium/drivers/llvmpipe/lp_texture.c
> +++ b/src/gallium/drivers/llvmpipe/lp_texture.c
> @@ -84,15 +84,15 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
>        {
>           unsigned alignment, nblocksx, nblocksy, block_size;
>  
> -         /* For non-compressed formats we need to align the texture size
> -          * to the tile size to facilitate render-to-texture.
> -          * XXX this blows up 1d/1d array textures by unreasonable
> -          * amount (factor 64), probably should do something about it.
> +         /* For non-compressed formats we need 4x4 pixel alignment
> +          * (for now). We also want cache line size in x direction,
> +          * otherwise same cache line could end up in multiple threads.
> +          * XXX this blows up 1d/1d array textures by a factor of 4.
>            */
>           if (util_format_is_compressed(pt->format))
>              alignment = 1;
>           else
> -            alignment = TILE_SIZE;
> +            alignment = 4;
>  
>           nblocksx = util_format_get_nblocksx(pt->format,
>                                               align(width, alignment));
> @@ -100,7 +100,10 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
>                                               align(height, alignment));
>           block_size = util_format_get_blocksize(pt->format);
>  
> -         lpr->row_stride[level] = align(nblocksx * block_size, 16);
> +         if (util_format_is_compressed(pt->format))
> +            lpr->row_stride[level] = nblocksx * block_size;
> +         else
> +            lpr->row_stride[level] = align(nblocksx * block_size, util_cpu_caps.cacheline);
>  
>           /* if row_stride * height > LP_MAX_TEXTURE_SIZE */
>           if (lpr->row_stride[level] > LP_MAX_TEXTURE_SIZE / nblocksy) {
> @@ -244,7 +247,8 @@ llvmpipe_resource_create(struct pipe_screen *_screen,
>        assert(templat->height0 == 1);
>        assert(templat->depth0 == 1);
>        assert(templat->last_level == 0);
> -      lpr->data = align_malloc(bytes, 16);
> +      /* align to 64 bytes (4xfloat4) in case we render to them */
> +      lpr->data = align_malloc(bytes, 64);
>        /*
>         * buffers don't really have stride but it's probably safer
>         * (for code doing same calculations for buffers and textures)
> @@ -327,7 +331,6 @@ llvmpipe_resource_map(struct pipe_resource *resource,
>        struct llvmpipe_screen *screen = llvmpipe_screen(resource->screen);
>        struct sw_winsys *winsys = screen->winsys;
>        unsigned dt_usage;
> -      uint8_t *map2;
>  
>        if (tex_usage == LP_TEX_USAGE_READ) {
>           dt_usage = PIPE_TRANSFER_READ;
> @@ -345,14 +348,11 @@ llvmpipe_resource_map(struct pipe_resource *resource,
>        /* install this linear image in texture data structure */
>        lpr->linear_img.data = map;
>  
> -      /* make sure tiled data gets converted to linear data */
> -      map2 = llvmpipe_get_texture_image(lpr, 0, 0, tex_usage);
> -      return map2;
> +      return map;
>     }
>     else if (llvmpipe_resource_is_texture(resource)) {
>  
> -      map = llvmpipe_get_texture_image(lpr, layer, level,
> -                                       tex_usage);
> +      map = llvmpipe_get_texture_image(lpr, layer, level, tex_usage);
>        return map;
>     }
>     else {
> 


More information about the mesa-dev mailing list