[Mesa-dev] [PATCH 10/10] gallium/radeon: remove the internal u_upload_mgr pointer
Edmondo Tommasina
edmondo.tommasina at gmail.com
Sun Jan 29 16:10:49 UTC 2017
For the series:
Tested-by: Edmondo Tommasina <edmondo.tommasina at gmail.com>
With the series applied, The Talos Principle benchmark seems to be about 0.5% faster.
Thanks
edmondo
On Fri, Jan 27, 2017 at 12:02 PM, Marek Olšák <maraeo at gmail.com> wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> also remove the BIND flags
> ---
> src/gallium/drivers/r600/r600_state_common.c | 14 +++++++++-----
> src/gallium/drivers/radeon/r600_buffer_common.c | 6 ++++--
> src/gallium/drivers/radeon/r600_pipe_common.c | 15 +++++++--------
> src/gallium/drivers/radeon/r600_pipe_common.h | 1 -
> src/gallium/drivers/radeonsi/si_compute.c | 8 ++++----
> src/gallium/drivers/radeonsi/si_descriptors.c | 7 ++++---
> src/gallium/drivers/radeonsi/si_state_draw.c | 6 ++++--
> 7 files changed, 32 insertions(+), 25 deletions(-)
>
> diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
> index c8502e9..9ff2364 100644
> --- a/src/gallium/drivers/r600/r600_state_common.c
> +++ b/src/gallium/drivers/r600/r600_state_common.c
> @@ -1086,24 +1086,27 @@ static void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint
>
> if (!(tmpPtr = malloc(size))) {
> R600_ERR("Failed to allocate BE swap buffer.\n");
> return;
> }
>
> for (i = 0; i < size / 4; ++i) {
> tmpPtr[i] = util_cpu_to_le32(((uint32_t *)ptr)[i]);
> }
>
> - u_upload_data(rctx->b.uploader, 0, size, 256, tmpPtr, &cb->buffer_offset, &cb->buffer);
> + u_upload_data(ctx->stream_uploader, 0, size, 256,
> + tmpPtr, &cb->buffer_offset, &cb->buffer);
> free(tmpPtr);
> } else {
> - u_upload_data(rctx->b.uploader, 0, input->buffer_size, 256, ptr, &cb->buffer_offset, &cb->buffer);
> + u_upload_data(ctx->stream_uploader, 0,
> + input->buffer_size, 256, ptr,
> + &cb->buffer_offset, &cb->buffer);
> }
> /* account it in gtt */
> rctx->b.gtt += input->buffer_size;
> } else {
> /* Setup the hw buffer. */
> cb->buffer_offset = input->buffer_offset;
> pipe_resource_reference(&cb->buffer, input->buffer);
> r600_context_add_resource_size(ctx, input->buffer);
> }
>
> @@ -1729,42 +1732,43 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
> data += info.indirect_offset / sizeof(unsigned);
> start = data[2] * ib.index_size;
> count = data[0];
> }
> else {
> start = 0;
> count = 0;
> }
> }
>
> - u_upload_alloc(rctx->b.uploader, start, count * 2, 256,
> - &out_offset, &out_buffer, &ptr);
> + u_upload_alloc(ctx->stream_uploader, start, count * 2,
> + 256, &out_offset, &out_buffer, &ptr);
>
> util_shorten_ubyte_elts_to_userptr(
> &rctx->b.b, &ib, 0, ib.offset + start, count, ptr);
>
> pipe_resource_reference(&ib.buffer, NULL);
> ib.user_buffer = NULL;
> ib.buffer = out_buffer;
> ib.offset = out_offset;
> ib.index_size = 2;
> }
>
> /* Upload the index buffer.
> * The upload is skipped for small index counts on little-endian machines
> * and the indices are emitted via PKT3_DRAW_INDEX_IMMD.
> * Indirect draws never use immediate indices.
> * Note: Instanced rendering in combination with immediate indices hangs. */
> if (ib.user_buffer && (R600_BIG_ENDIAN || info.indirect ||
> info.instance_count > 1 ||
> info.count*ib.index_size > 20)) {
> - u_upload_data(rctx->b.uploader, 0, info.count * ib.index_size, 256,
> + u_upload_data(ctx->stream_uploader, 0,
> + info.count * ib.index_size, 256,
> ib.user_buffer, &ib.offset, &ib.buffer);
> ib.user_buffer = NULL;
> }
> } else {
> info.index_bias = info.start;
> }
>
> /* Set the index offset and primitive restart. */
> if (rctx->vgt_state.vgt_multi_prim_ib_reset_en != info.primitive_restart ||
> rctx->vgt_state.vgt_multi_prim_ib_reset_indx != info.restart_index ||
> diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
> index da6f020..40a644b 100644
> --- a/src/gallium/drivers/radeon/r600_buffer_common.c
> +++ b/src/gallium/drivers/radeon/r600_buffer_common.c
> @@ -361,22 +361,24 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
> r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) {
> assert(usage & PIPE_TRANSFER_WRITE);
>
> /* Check if mapping this buffer would cause waiting for the GPU. */
> if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
> !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
> /* Do a wait-free write-only transfer using a temporary buffer. */
> unsigned offset;
> struct r600_resource *staging = NULL;
>
> - u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
> - 256, &offset, (struct pipe_resource**)&staging, (void**)&data);
> + u_upload_alloc(ctx->stream_uploader, 0,
> + box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
> + 256, &offset, (struct pipe_resource**)&staging,
> + (void**)&data);
>
> if (staging) {
> data += box->x % R600_MAP_BUFFER_ALIGNMENT;
> return r600_buffer_get_transfer(ctx, resource, level, usage, box,
> ptransfer, data, staging, offset);
> }
> } else {
> /* At this point, the buffer is always idle (we checked it above). */
> usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
> }
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
> index 1da7228..a8cba9d 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.c
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.c
> @@ -186,21 +186,22 @@ void r600_draw_rectangle(struct blitter_context *blitter,
> viewport.scale[1] = 1.0f;
> viewport.scale[2] = 1.0f;
> viewport.translate[0] = 0.0f;
> viewport.translate[1] = 0.0f;
> viewport.translate[2] = 0.0f;
> rctx->b.set_viewport_states(&rctx->b, 0, 1, &viewport);
>
> /* Upload vertices. The hw rectangle has only 3 vertices,
> * I guess the 4th one is derived from the first 3.
> * The vertex specification should match u_blitter's vertex element state. */
> - u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, 256, &offset, &buf, (void**)&vb);
> + u_upload_alloc(rctx->b.stream_uploader, 0, sizeof(float) * 24, 256,
> + &offset, &buf, (void**)&vb);
> if (!buf)
> return;
>
> vb[0] = x1;
> vb[1] = y1;
> vb[2] = depth;
> vb[3] = 1;
>
> vb[8] = x1;
> vb[9] = y2;
> @@ -594,26 +595,24 @@ bool r600_common_context_init(struct r600_common_context *rctx,
> r600_streamout_init(rctx);
> r600_query_init(rctx);
> cayman_init_msaa(&rctx->b);
>
> rctx->allocator_zeroed_memory =
> u_suballocator_create(&rctx->b, rscreen->info.gart_page_size,
> 0, PIPE_USAGE_DEFAULT, true);
> if (!rctx->allocator_zeroed_memory)
> return false;
>
> - rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024,
> - PIPE_BIND_INDEX_BUFFER |
> - PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM);
> - if (!rctx->uploader)
> + rctx->b.stream_uploader = u_upload_create(&rctx->b, 1024 * 1024,
> + 0, PIPE_USAGE_STREAM);
> + if (!rctx->b.stream_uploader)
> return false;
> - rctx->b.stream_uploader = rctx->uploader;
>
> rctx->ctx = rctx->ws->ctx_create(rctx->ws);
> if (!rctx->ctx)
> return false;
>
> if (rscreen->info.has_sdma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
> rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
> r600_flush_dma_ring,
> rctx);
> rctx->dma.flush = r600_flush_dma_ring;
> @@ -641,22 +640,22 @@ void r600_common_context_cleanup(struct r600_common_context *rctx)
> if (rctx->query_result_shader)
> rctx->b.delete_compute_state(&rctx->b, rctx->query_result_shader);
>
> if (rctx->gfx.cs)
> rctx->ws->cs_destroy(rctx->gfx.cs);
> if (rctx->dma.cs)
> rctx->ws->cs_destroy(rctx->dma.cs);
> if (rctx->ctx)
> rctx->ws->ctx_destroy(rctx->ctx);
>
> - if (rctx->uploader) {
> - u_upload_destroy(rctx->uploader);
> + if (rctx->b.stream_uploader) {
> + u_upload_destroy(rctx->b.stream_uploader);
> }
>
> slab_destroy_child(&rctx->pool_transfers);
>
> if (rctx->allocator_zeroed_memory) {
> u_suballocator_destroy(rctx->allocator_zeroed_memory);
> }
> rctx->ws->fence_reference(&rctx->last_gfx_fence, NULL);
> rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL);
> }
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
> index fafe6c1..f228755 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.h
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.h
> @@ -547,21 +547,20 @@ struct r600_common_context {
> struct r600_ring gfx;
> struct r600_ring dma;
> struct pipe_fence_handle *last_gfx_fence;
> struct pipe_fence_handle *last_sdma_fence;
> unsigned num_gfx_cs_flushes;
> unsigned initial_gfx_cs_size;
> unsigned gpu_reset_counter;
> unsigned last_dirty_tex_counter;
> unsigned last_compressed_colortex_counter;
>
> - struct u_upload_mgr *uploader;
> struct u_suballocator *allocator_zeroed_memory;
> struct slab_child_pool pool_transfers;
>
> /* Current unaccounted memory usage. */
> uint64_t vram;
> uint64_t gtt;
>
> /* States. */
> struct r600_streamout streamout;
> struct r600_scissors scissors;
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> index d05c488..aae651c 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -496,23 +496,23 @@ static void si_setup_user_sgprs_co_v2(struct si_context *sctx,
>
> dispatch.grid_size_x = info->grid[0] * info->block[0];
> dispatch.grid_size_y = info->grid[1] * info->block[1];
> dispatch.grid_size_z = info->grid[2] * info->block[2];
>
> dispatch.private_segment_size = program->private_size;
> dispatch.group_segment_size = program->local_size;
>
> dispatch.kernarg_address = kernel_args_va;
>
> - u_upload_data(sctx->b.uploader, 0, sizeof(dispatch), 256,
> - &dispatch, &dispatch_offset,
> - (struct pipe_resource**)&dispatch_buf);
> + u_upload_data(sctx->b.b.stream_uploader, 0, sizeof(dispatch),
> + 256, &dispatch, &dispatch_offset,
> + (struct pipe_resource**)&dispatch_buf);
>
> if (!dispatch_buf) {
> fprintf(stderr, "Error: Failed to allocate dispatch "
> "packet.");
> }
> radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, dispatch_buf,
> RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
>
> dispatch_va = dispatch_buf->gpu_address + dispatch_offset;
>
> @@ -558,21 +558,21 @@ static void si_upload_compute_input(struct si_context *sctx,
> unsigned num_work_size_bytes = program->use_code_object_v2 ? 0 : 36;
> uint32_t kernel_args_offset = 0;
> uint32_t *kernel_args;
> void *kernel_args_ptr;
> uint64_t kernel_args_va;
> unsigned i;
>
> /* The extra num_work_size_bytes are for work group / work item size information */
> kernel_args_size = program->input_size + num_work_size_bytes;
>
> - u_upload_alloc(sctx->b.uploader, 0, kernel_args_size, 256,
> + u_upload_alloc(sctx->b.b.stream_uploader, 0, kernel_args_size, 256,
> &kernel_args_offset,
> (struct pipe_resource**)&input_buffer, &kernel_args_ptr);
>
> kernel_args = (uint32_t*)kernel_args_ptr;
> kernel_args_va = input_buffer->gpu_address + kernel_args_offset;
>
> if (!code_object) {
> for (i = 0; i < 3; i++) {
> kernel_args[i] = info->grid[i];
> kernel_args[i + 3] = info->grid[i] * info->block[i];
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 4c1120a..4a5407a 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -227,21 +227,21 @@ static bool si_upload_descriptors(struct si_context *sctx,
> radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
> radeon_emit_array(sctx->ce_ib, list + begin, count);
> }
>
> if (!si_ce_upload(sctx, desc->ce_offset, list_size,
> &desc->buffer_offset, &desc->buffer))
> return false;
> } else {
> void *ptr;
>
> - u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
> + u_upload_alloc(sctx->b.b.stream_uploader, 0, list_size, 256,
> &desc->buffer_offset,
> (struct pipe_resource**)&desc->buffer, &ptr);
> if (!desc->buffer)
> return false; /* skip the draw call */
>
> util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
> desc->gpu_list = ptr;
>
> radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
> RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
> @@ -953,21 +953,22 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
> if (!sctx->vertex_buffers_dirty || !count || !velems)
> return true;
>
> unsigned fix_size3 = velems->fix_size3;
> unsigned first_vb_use_mask = velems->first_vb_use_mask;
>
> /* Vertex buffer descriptors are the only ones which are uploaded
> * directly through a staging buffer and don't go through
> * the fine-grained upload path.
> */
> - u_upload_alloc(sctx->b.uploader, 0, count * 16, 256, &desc->buffer_offset,
> + u_upload_alloc(sctx->b.b.stream_uploader, 0, count * 16, 256,
> + &desc->buffer_offset,
> (struct pipe_resource**)&desc->buffer, (void**)&ptr);
> if (!desc->buffer)
> return false;
>
> radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
> desc->buffer, RADEON_USAGE_READ,
> RADEON_PRIO_DESCRIPTORS);
>
> assert(count <= SI_NUM_VERTEX_BUFFERS);
>
> @@ -1059,21 +1060,21 @@ static struct si_descriptors *
> si_const_buffer_descriptors(struct si_context *sctx, unsigned shader)
> {
> return &sctx->descriptors[si_const_buffer_descriptors_idx(shader)];
> }
>
> void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
> const uint8_t *ptr, unsigned size, uint32_t *const_offset)
> {
> void *tmp;
>
> - u_upload_alloc(sctx->b.uploader, 0, size, 256, const_offset,
> + u_upload_alloc(sctx->b.b.stream_uploader, 0, size, 256, const_offset,
> (struct pipe_resource**)rbuffer, &tmp);
> if (*rbuffer)
> util_memcpy_cpu_to_le32(tmp, ptr, size);
> }
>
> static void si_set_constant_buffer(struct si_context *sctx,
> struct si_buffer_resources *buffers,
> unsigned descriptors_idx,
> uint slot, const struct pipe_constant_buffer *input)
> {
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index 1dd6dcc..01340af 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -1049,21 +1049,22 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
> /* Translate or upload, if needed. */
> /* 8-bit indices are supported on VI. */
> if (sctx->b.chip_class <= CIK && ib.index_size == 1) {
> struct pipe_resource *out_buffer = NULL;
> unsigned out_offset, start, count, start_offset;
> void *ptr;
>
> si_get_draw_start_count(sctx, info, &start, &count);
> start_offset = start * ib.index_size;
>
> - u_upload_alloc(sctx->b.uploader, start_offset, count * 2, 256,
> + u_upload_alloc(ctx->stream_uploader, start_offset,
> + count * 2, 256,
> &out_offset, &out_buffer, &ptr);
> if (!out_buffer) {
> pipe_resource_reference(&ib.buffer, NULL);
> return;
> }
>
> util_shorten_ubyte_elts_to_userptr(&sctx->b.b, &ib, 0,
> ib.offset + start_offset,
> count, ptr);
>
> @@ -1072,21 +1073,22 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
> ib.buffer = out_buffer;
> /* info->start will be added by the drawing code */
> ib.offset = out_offset - start_offset;
> ib.index_size = 2;
> } else if (ib.user_buffer && !ib.buffer) {
> unsigned start, count, start_offset;
>
> si_get_draw_start_count(sctx, info, &start, &count);
> start_offset = start * ib.index_size;
>
> - u_upload_data(sctx->b.uploader, start_offset, count * ib.index_size,
> + u_upload_data(ctx->stream_uploader, start_offset,
> + count * ib.index_size,
> 256, (char*)ib.user_buffer + start_offset,
> &ib.offset, &ib.buffer);
> if (!ib.buffer)
> return;
> /* info->start will be added by the drawing code */
> ib.offset -= start_offset;
> }
> }
>
> /* VI reads index buffers through TC L2. */
> --
> 2.7.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev mailing list