[Mesa-dev] [PATCH] llvmpipe: reduce alignment requirement for resources from 64x64 to 4x4
Roland Scheidegger
sroland at vmware.com
Tue May 28 18:42:28 PDT 2013
FWIW this change is built on top of "llvmpipe: Remove x/y from cmd_bin",
which was discussed on the list but hasn't been committed yet.
Roland
Am 29.05.2013 03:41, schrieb sroland at vmware.com:
> From: Roland Scheidegger <sroland at vmware.com>
>
> The overallocation was very bad especially for things like 1d array
> textures which got blown up by a factor of 64. (Even ordinary smallish
> 2d textures benefit a lot from this, a mipmapped 64x64 rgba8 texture
> previously used 7*16kB = 112kB instead of now ~22kB.)
> 4x4 is chosen because this is the size the jit functions run on, so
> making it smaller is going to be a bit more complicated.
> It is actually not strictly 4x4 pixels: to avoid situations where different
> threads are rendering to the same cacheline, we keep cacheline size
> alignment in the x direction (often 64 bytes).
> To make this work introduce new task width/height parameters and make
> sure clears don't clear the whole tile if it's a partial tile. Likewise,
> the rasterizer may produce fragments outside the 4x4 blocks present in a
> tile, so don't call the jit function for them.
> This does not yet fix rendering to buffers (which cannot have any y
> alignment at all), and 1d/1d array textures are still overallocated by a
> factor of 4.
> ---
> src/gallium/drivers/llvmpipe/lp_rast.c | 56 ++++++++++++++++-----------
> src/gallium/drivers/llvmpipe/lp_rast_priv.h | 37 +++++++++++-------
> src/gallium/drivers/llvmpipe/lp_scene.c | 2 +
> src/gallium/drivers/llvmpipe/lp_scene.h | 4 ++
> src/gallium/drivers/llvmpipe/lp_setup.c | 3 +-
> src/gallium/drivers/llvmpipe/lp_texture.c | 26 ++++++-------
> 6 files changed, 75 insertions(+), 53 deletions(-)
>
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
> index 5c837a0..be5a286 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast.c
> +++ b/src/gallium/drivers/llvmpipe/lp_rast.c
> @@ -95,6 +95,10 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task,
> task->bin = bin;
> task->x = x * TILE_SIZE;
> task->y = y * TILE_SIZE;
> + task->width = TILE_SIZE + x * TILE_SIZE > task->scene->width_aligned ?
> + task->scene->width_aligned - x * TILE_SIZE : TILE_SIZE;
> + task->height = TILE_SIZE + y * TILE_SIZE > task->scene->height_aligned ?
> + task->scene->height_aligned - y * TILE_SIZE : TILE_SIZE;
>
> /* reset pointers to color and depth tile(s) */
> memset(task->color_tiles, 0, sizeof(task->color_tiles));
> @@ -144,8 +148,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,
> scene->cbufs[i].stride,
> task->x,
> task->y,
> - TILE_SIZE,
> - TILE_SIZE,
> + task->width,
> + task->height,
> &uc);
> }
> }
> @@ -172,8 +176,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,
> scene->cbufs[i].stride,
> task->x,
> task->y,
> - TILE_SIZE,
> - TILE_SIZE,
> + task->width,
> + task->height,
> &uc);
> }
> }
> @@ -198,8 +202,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
> uint64_t clear_mask64 = arg.clear_zstencil.mask;
> uint32_t clear_value = (uint32_t) clear_value64;
> uint32_t clear_mask = (uint32_t) clear_mask64;
> - const unsigned height = TILE_SIZE;
> - const unsigned width = TILE_SIZE;
> + const unsigned height = task->height;
> + const unsigned width = task->width;
> const unsigned block_size = scene->zsbuf.blocksize;
> const unsigned dst_stride = scene->zsbuf.stride;
> uint8_t *dst;
> @@ -325,8 +329,8 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
> variant = state->variant;
>
> /* render the whole 64x64 tile in 4x4 chunks */
> - for (y = 0; y < TILE_SIZE; y += 4){
> - for (x = 0; x < TILE_SIZE; x += 4) {
> + for (y = 0; y < task->height; y += 4){
> + for (x = 0; x < task->width; x += 4) {
> uint8_t *color[PIPE_MAX_COLOR_BUFS];
> unsigned stride[PIPE_MAX_COLOR_BUFS];
> uint8_t *depth = NULL;
> @@ -434,21 +438,27 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
>
> assert(lp_check_alignment(state->jit_context.u8_blend_color, 16));
>
> - /* run shader on 4x4 block */
> - BEGIN_JIT_CALL(state, task);
> - variant->jit_function[RAST_EDGE_TEST](&state->jit_context,
> - x, y,
> - inputs->frontfacing,
> - GET_A0(inputs),
> - GET_DADX(inputs),
> - GET_DADY(inputs),
> - color,
> - depth,
> - mask,
> - &task->thread_data,
> - stride,
> - depth_stride);
> - END_JIT_CALL();
> + /*
> + * The rasterizer may produce fragments outside our
> + * allocated 4x4 blocks hence need to filter them out here.
> + */
> + if ((x % TILE_SIZE) < task->width && (y % TILE_SIZE) < task->height) {
> + /* run shader on 4x4 block */
> + BEGIN_JIT_CALL(state, task);
> + variant->jit_function[RAST_EDGE_TEST](&state->jit_context,
> + x, y,
> + inputs->frontfacing,
> + GET_A0(inputs),
> + GET_DADX(inputs),
> + GET_DADY(inputs),
> + color,
> + depth,
> + mask,
> + &task->thread_data,
> + stride,
> + depth_stride);
> + END_JIT_CALL();
> + }
> }
>
>
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
> index e4b6e5b..4876d74 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
> +++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
> @@ -86,6 +86,7 @@ struct lp_rasterizer_task
>
> struct lp_scene *scene;
> unsigned x, y; /**< Pos of this tile in framebuffer, in pixels */
> + unsigned width, height; /**< width, height of current tile, in pixels */
>
> uint8_t *color_tiles[PIPE_MAX_COLOR_BUFS];
> uint8_t *depth_tile;
> @@ -293,21 +294,27 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
> depth_stride = scene->zsbuf.stride;
> }
>
> - /* run shader on 4x4 block */
> - BEGIN_JIT_CALL(state, task);
> - variant->jit_function[RAST_WHOLE]( &state->jit_context,
> - x, y,
> - inputs->frontfacing,
> - GET_A0(inputs),
> - GET_DADX(inputs),
> - GET_DADY(inputs),
> - color,
> - depth,
> - 0xffff,
> - &task->thread_data,
> - stride,
> - depth_stride);
> - END_JIT_CALL();
> + /*
> + * The rasterizer may produce fragments outside our
> + * allocated 4x4 blocks hence need to filter them out here.
> + */
> + if ((x % TILE_SIZE) < task->width && (y % TILE_SIZE) < task->height) {
> + /* run shader on 4x4 block */
> + BEGIN_JIT_CALL(state, task);
> + variant->jit_function[RAST_WHOLE]( &state->jit_context,
> + x, y,
> + inputs->frontfacing,
> + GET_A0(inputs),
> + GET_DADX(inputs),
> + GET_DADY(inputs),
> + color,
> + depth,
> + 0xffff,
> + &task->thread_data,
> + stride,
> + depth_stride);
> + END_JIT_CALL();
> + }
> }
>
> void lp_rast_triangle_1( struct lp_rasterizer_task *,
> diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c
> index 771ad08..1d0dbdf 100644
> --- a/src/gallium/drivers/llvmpipe/lp_scene.c
> +++ b/src/gallium/drivers/llvmpipe/lp_scene.c
> @@ -505,6 +505,8 @@ void lp_scene_begin_binning( struct lp_scene *scene,
>
> scene->tiles_x = align(fb->width, TILE_SIZE) / TILE_SIZE;
> scene->tiles_y = align(fb->height, TILE_SIZE) / TILE_SIZE;
> + scene->width_aligned = align(fb->width, 4);
> + scene->height_aligned = align(fb->height, 4);
>
> assert(scene->tiles_x <= TILES_X);
> assert(scene->tiles_y <= TILES_Y);
> diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
> index fa5bbca..bc6c448 100644
> --- a/src/gallium/drivers/llvmpipe/lp_scene.h
> +++ b/src/gallium/drivers/llvmpipe/lp_scene.h
> @@ -144,6 +144,10 @@ struct lp_scene {
> /** list of resources referenced by the scene commands */
> struct resource_ref *resources;
>
> + /** aligned scene width, height */
> + unsigned width_aligned;
> + unsigned height_aligned;
> +
> /** Total memory used by the scene (in bytes). This sums all the
> * data blocks and counts all bins, state, resource references and
> * other random allocations within the scene.
> diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
> index eb39b1c..8f2baf0 100644
> --- a/src/gallium/drivers/llvmpipe/lp_setup.c
> +++ b/src/gallium/drivers/llvmpipe/lp_setup.c
> @@ -691,8 +691,7 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
> assert(last_level <= res->last_level);
>
> /*
> - * The complexity here is only necessary for depth textures which
> - * still are tiled.
> + * The complexity here should no longer be necessary.
> */
> mip_ptr = llvmpipe_get_texture_image_all(lp_tex, first_level,
> LP_TEX_USAGE_READ);
> diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
> index 0ac3528..8e9ebdf 100644
> --- a/src/gallium/drivers/llvmpipe/lp_texture.c
> +++ b/src/gallium/drivers/llvmpipe/lp_texture.c
> @@ -84,15 +84,15 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
> {
> unsigned alignment, nblocksx, nblocksy, block_size;
>
> - /* For non-compressed formats we need to align the texture size
> - * to the tile size to facilitate render-to-texture.
> - * XXX this blows up 1d/1d array textures by unreasonable
> - * amount (factor 64), probably should do something about it.
> + /* For non-compressed formats we need 4x4 pixel alignment
> + * (for now). We also want cache line size in x direction,
> + * otherwise same cache line could end up in multiple threads.
> + * XXX this blows up 1d/1d array textures by a factor of 4.
> */
> if (util_format_is_compressed(pt->format))
> alignment = 1;
> else
> - alignment = TILE_SIZE;
> + alignment = 4;
>
> nblocksx = util_format_get_nblocksx(pt->format,
> align(width, alignment));
> @@ -100,7 +100,10 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
> align(height, alignment));
> block_size = util_format_get_blocksize(pt->format);
>
> - lpr->row_stride[level] = align(nblocksx * block_size, 16);
> + if (util_format_is_compressed(pt->format))
> + lpr->row_stride[level] = nblocksx * block_size;
> + else
> + lpr->row_stride[level] = align(nblocksx * block_size, util_cpu_caps.cacheline);
>
> /* if row_stride * height > LP_MAX_TEXTURE_SIZE */
> if (lpr->row_stride[level] > LP_MAX_TEXTURE_SIZE / nblocksy) {
> @@ -244,7 +247,8 @@ llvmpipe_resource_create(struct pipe_screen *_screen,
> assert(templat->height0 == 1);
> assert(templat->depth0 == 1);
> assert(templat->last_level == 0);
> - lpr->data = align_malloc(bytes, 16);
> + /* align to 64 bytes (4xfloat4) in case we render to them */
> + lpr->data = align_malloc(bytes, 64);
> /*
> * buffers don't really have stride but it's probably safer
> * (for code doing same calculations for buffers and textures)
> @@ -327,7 +331,6 @@ llvmpipe_resource_map(struct pipe_resource *resource,
> struct llvmpipe_screen *screen = llvmpipe_screen(resource->screen);
> struct sw_winsys *winsys = screen->winsys;
> unsigned dt_usage;
> - uint8_t *map2;
>
> if (tex_usage == LP_TEX_USAGE_READ) {
> dt_usage = PIPE_TRANSFER_READ;
> @@ -345,14 +348,11 @@ llvmpipe_resource_map(struct pipe_resource *resource,
> /* install this linear image in texture data structure */
> lpr->linear_img.data = map;
>
> - /* make sure tiled data gets converted to linear data */
> - map2 = llvmpipe_get_texture_image(lpr, 0, 0, tex_usage);
> - return map2;
> + return map;
> }
> else if (llvmpipe_resource_is_texture(resource)) {
>
> - map = llvmpipe_get_texture_image(lpr, layer, level,
> - tex_usage);
> + map = llvmpipe_get_texture_image(lpr, layer, level, tex_usage);
> return map;
> }
> else {
>
More information about the mesa-dev
mailing list