[Mesa-dev] [PATCH v2] r600g: Implement ARB_draw_indirect for EG/CM
Marek Olšák
maraeo at gmail.com
Sat Dec 13 04:52:03 PST 2014
1) You're using magic numbers instead of the actual packet definitions.
2) index_bias and start are also set in vgt_indx_offset of vgt_state.
This won't work if index_bias or start is non-zero. vgt_indx_offset should
be set to zero for indirect drawing.
Marek
On Sat, Dec 13, 2014 at 3:40 AM, Glenn Kennard <glenn.kennard at gmail.com> wrote:
> Requires Evergreen/Cayman and updated radeon kernel module
>
> Signed-off-by: Glenn Kennard <glenn.kennard at gmail.com>
> ---
> Changes since V1:
> * Fixed 8-bit index case, only triggerable using GLES 3.1, which isn't supported yet
> * Don't read info struct values that have no meaning for indirect case
> * Don't update start_instance/instance_count for indirect cases
> * Use bool expression directly in get_param
>
> Benjamin, the #defines are essentially used, but due to a header conflict
> it's not possible to include them in this file. I would have broken the indirect cases
> out into evergreen_state.c, but this is a performance-sensitive section of code and
> inlining is critical, so I did the next best thing and typed out the define names
> as comments.
>
> Thanks Marek/Benjamin for V1 review
>
> docs/GL3.txt | 4 +-
> docs/relnotes/10.5.0.html | 1 +
> src/gallium/drivers/r600/evergreend.h | 6 +-
> src/gallium/drivers/r600/r600_pipe.c | 4 +-
> src/gallium/drivers/r600/r600_state_common.c | 116 ++++++++++++++++++++++-----
> 5 files changed, 105 insertions(+), 26 deletions(-)
>
> diff --git a/docs/GL3.txt b/docs/GL3.txt
> index 648f5ac..435054a 100644
> --- a/docs/GL3.txt
> +++ b/docs/GL3.txt
> @@ -95,7 +95,7 @@ GL 3.3, GLSL 3.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft
> GL 4.0, GLSL 4.00:
>
> GL_ARB_draw_buffers_blend DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
> - GL_ARB_draw_indirect DONE (i965, nvc0, radeonsi, llvmpipe, softpipe)
> + GL_ARB_draw_indirect DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
> GL_ARB_gpu_shader5 DONE (i965, nvc0)
> - 'precise' qualifier DONE
> - Dynamically uniform sampler array indices DONE (r600)
> @@ -159,7 +159,7 @@ GL 4.3, GLSL 4.30:
> GL_ARB_framebuffer_no_attachments not started
> GL_ARB_internalformat_query2 not started
> GL_ARB_invalidate_subdata DONE (all drivers)
> - GL_ARB_multi_draw_indirect DONE (i965, nvc0, radeonsi, llvmpipe, softpipe)
> + GL_ARB_multi_draw_indirect DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
> GL_ARB_program_interface_query not started
> GL_ARB_robust_buffer_access_behavior not started
> GL_ARB_shader_image_size not started
> diff --git a/docs/relnotes/10.5.0.html b/docs/relnotes/10.5.0.html
> index 2987d53..72bb791 100644
> --- a/docs/relnotes/10.5.0.html
> +++ b/docs/relnotes/10.5.0.html
> @@ -49,6 +49,7 @@ Note: some of the new features are only available with certain drivers.
> <li>GL_EXT_packed_float on freedreno</li>
> <li>GL_EXT_texture_shared_exponent on freedreno</li>
> <li>GL_EXT_texture_snorm on freedreno</li>
> +<li>GL_ARB_draw_indirect, GL_ARB_multi_draw_indirect on r600</li>
> </ul>
>
>
> diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
> index 4989996..0725f0d 100644
> --- a/src/gallium/drivers/r600/evergreend.h
> +++ b/src/gallium/drivers/r600/evergreend.h
> @@ -64,6 +64,8 @@
> #define R600_TEXEL_PITCH_ALIGNMENT_MASK 0x7
>
> #define PKT3_NOP 0x10
> +#define PKT3_SET_BASE 0x11
> +#define PKT3_INDEX_BUFFER_SIZE 0x13
> #define PKT3_DEALLOC_STATE 0x14
> #define PKT3_DISPATCH_DIRECT 0x15
> #define PKT3_DISPATCH_INDIRECT 0x16
> @@ -72,7 +74,9 @@
> #define PKT3_REG_RMW 0x21
> #define PKT3_COND_EXEC 0x22
> #define PKT3_PRED_EXEC 0x23
> -#define PKT3_START_3D_CMDBUF 0x24
> +#define PKT3_DRAW_INDIRECT 0x24
> +#define PKT3_DRAW_INDEX_INDIRECT 0x25
> +#define PKT3_INDEX_BASE 0x26
> #define PKT3_DRAW_INDEX_2 0x27
> #define PKT3_CONTEXT_CONTROL 0x28
> #define PKT3_DRAW_INDEX_IMMD_BE 0x29
> diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
> index 0b571e4..0d8bac2 100644
> --- a/src/gallium/drivers/r600/r600_pipe.c
> +++ b/src/gallium/drivers/r600/r600_pipe.c
> @@ -313,6 +313,9 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
> return family >= CHIP_CEDAR ? 1 : 0;
> case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
> return family >= CHIP_CEDAR ? 4 : 0;
> + case PIPE_CAP_DRAW_INDIRECT:
> + /* kernel command checker support is also required */
> + return family >= CHIP_CEDAR && rscreen->b.info.drm_minor >= 41;
>
> /* Unsupported features. */
> case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
> @@ -322,7 +325,6 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
> case PIPE_CAP_VERTEX_COLOR_CLAMPED:
> case PIPE_CAP_USER_VERTEX_BUFFERS:
> case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
> - case PIPE_CAP_DRAW_INDIRECT:
> case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
> case PIPE_CAP_SAMPLER_VIEW_TARGET:
> return 0;
> diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
> index 09d8952..57f87d1 100644
> --- a/src/gallium/drivers/r600/r600_state_common.c
> +++ b/src/gallium/drivers/r600/r600_state_common.c
> @@ -1347,7 +1347,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
> unsigned i;
> struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
>
> - if (!info.count && (info.indexed || !info.count_from_stream_output)) {
> + if (!info.indirect && !info.count && (info.indexed || !info.count_from_stream_output)) {
> return;
> }
>
> @@ -1373,19 +1373,44 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
> pipe_resource_reference(&ib.buffer, rctx->index_buffer.buffer);
> ib.user_buffer = rctx->index_buffer.user_buffer;
> ib.index_size = rctx->index_buffer.index_size;
> - ib.offset = rctx->index_buffer.offset + info.start * ib.index_size;
> + ib.offset = rctx->index_buffer.offset;
> + if (!info.indirect) {
> + ib.offset += info.start * ib.index_size;
> + }
>
> /* Translate 8-bit indices to 16-bit. */
> - if (ib.index_size == 1) {
> + if (unlikely(ib.index_size == 1)) {
> struct pipe_resource *out_buffer = NULL;
> unsigned out_offset;
> void *ptr;
> + unsigned start, count;
> +
> + if (likely(!info.indirect)) {
> + start = 0;
> + count = info.count;
> + }
> + else {
> + /* Have to get start/count from indirect buffer, slow path ahead... */
> + struct r600_resource *indirect_resource = (struct r600_resource *)info.indirect;
> + unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource,
> + PIPE_TRANSFER_READ);
> + if (data) {
> + data += info.indirect_offset / sizeof(unsigned);
> + start = data[2] * ib.index_size;
> + count = data[0];
> + rctx->b.ws->buffer_unmap(indirect_resource->cs_buf);
> + }
> + else {
> + start = 0;
> + count = 0;
> + }
> + }
>
> - u_upload_alloc(rctx->b.uploader, 0, info.count * 2,
> + u_upload_alloc(rctx->b.uploader, start, count * 2,
> &out_offset, &out_buffer, &ptr);
>
> util_shorten_ubyte_elts_to_userptr(
> - &rctx->b.b, &ib, 0, ib.offset, info.count, ptr);
> + &rctx->b.b, &ib, 0, ib.offset + start, count, ptr);
>
> pipe_resource_reference(&ib.buffer, NULL);
> ib.user_buffer = NULL;
> @@ -1397,9 +1422,11 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
> /* Upload the index buffer.
> * The upload is skipped for small index counts on little-endian machines
> * and the indices are emitted via PKT3_DRAW_INDEX_IMMD.
> + * Indirect draws never use immediate indices.
> * Note: Instanced rendering in combination with immediate indices hangs. */
> - if (ib.user_buffer && (R600_BIG_ENDIAN || info.instance_count > 1 ||
> - info.count*ib.index_size > 20)) {
> + if (ib.user_buffer && (R600_BIG_ENDIAN || info.indirect ||
> + info.instance_count > 1 ||
> + info.count*ib.index_size > 20)) {
> u_upload_data(rctx->b.uploader, 0, info.count * ib.index_size,
> ib.user_buffer, &ib.offset, &ib.buffer);
> ib.user_buffer = NULL;
> @@ -1479,7 +1506,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
> }
>
> /* Update start instance. */
> - if (rctx->last_start_instance != info.start_instance) {
> + if (!info.indirect && rctx->last_start_instance != info.start_instance) {
> r600_write_ctl_const(cs, R_03CFF4_SQ_VTX_START_INST_LOC, info.start_instance);
> rctx->last_start_instance = info.start_instance;
> }
> @@ -1504,8 +1531,25 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
> }
>
> /* Draw packets. */
> - cs->buf[cs->cdw++] = PKT3(PKT3_NUM_INSTANCES, 0, rctx->b.predicate_drawing);
> - cs->buf[cs->cdw++] = info.instance_count;
> + if (!info.indirect) {
> + cs->buf[cs->cdw++] = PKT3(PKT3_NUM_INSTANCES, 0, rctx->b.predicate_drawing);
> + cs->buf[cs->cdw++] = info.instance_count;
> + }
> +
> + if (unlikely(info.indirect)) {
> + uint64_t va = r600_resource(info.indirect)->gpu_address;
> + assert(rctx->b.chip_class >= EVERGREEN);
> + cs->buf[cs->cdw++] = PKT3(0x11 /* PKT3_SET_BASE */, 2, rctx->b.predicate_drawing);
> + cs->buf[cs->cdw++] = 1; // 1 means DX11 Draw_Index_Indirect Patch Table Base
> + cs->buf[cs->cdw++] = va;
> + cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
> +
> + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing);
> + cs->buf[cs->cdw++] = r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
> + (struct r600_resource*)info.indirect,
> + RADEON_USAGE_READ, RADEON_PRIO_MIN);
> + }
> +
> if (info.indexed) {
> cs->buf[cs->cdw++] = PKT3(PKT3_INDEX_TYPE, 0, rctx->b.predicate_drawing);
> cs->buf[cs->cdw++] = ib.index_size == 4 ?
> @@ -1522,18 +1566,40 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
> cs->cdw += size_dw;
> } else {
> uint64_t va = r600_resource(ib.buffer)->gpu_address + ib.offset;
> - cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX, 3, rctx->b.predicate_drawing);
> - cs->buf[cs->cdw++] = va;
> - cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
> - cs->buf[cs->cdw++] = info.count;
> - cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA;
> - cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing);
> - cs->buf[cs->cdw++] = r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
> - (struct r600_resource*)ib.buffer,
> - RADEON_USAGE_READ, RADEON_PRIO_MIN);
> +
> + if (likely(!info.indirect)) {
> + cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX, 3, rctx->b.predicate_drawing);
> + cs->buf[cs->cdw++] = va;
> + cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
> + cs->buf[cs->cdw++] = info.count;
> + cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA;
> + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing);
> + cs->buf[cs->cdw++] = r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
> + (struct r600_resource*)ib.buffer,
> + RADEON_USAGE_READ, RADEON_PRIO_MIN);
> + }
> + else {
> + uint32_t max_size = (ib.buffer->width0 - ib.offset) / ib.index_size;
> +
> + cs->buf[cs->cdw++] = PKT3(0x26 /* PKT3_INDEX_BASE */, 1, rctx->b.predicate_drawing);
> + cs->buf[cs->cdw++] = va;
> + cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
> +
> + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing);
> + cs->buf[cs->cdw++] = r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
> + (struct r600_resource*)ib.buffer,
> + RADEON_USAGE_READ, RADEON_PRIO_MIN);
> +
> + cs->buf[cs->cdw++] = PKT3(0x13 /* PKT3_INDEX_BUFFER_SIZE */, 0, rctx->b.predicate_drawing);
> + cs->buf[cs->cdw++] = max_size;
> +
> + cs->buf[cs->cdw++] = PKT3(0x25 /* PKT3_DRAW_INDEX_INDIRECT */, 1, rctx->b.predicate_drawing);
> + cs->buf[cs->cdw++] = info.indirect_offset;
> + cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA;
> + }
> }
> } else {
> - if (info.count_from_stream_output) {
> + if (unlikely(info.count_from_stream_output)) {
> struct r600_so_target *t = (struct r600_so_target*)info.count_from_stream_output;
> uint64_t va = t->buf_filled_size->gpu_address + t->buf_filled_size_offset;
>
> @@ -1552,8 +1618,14 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
> RADEON_PRIO_MIN);
> }
>
> - cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, rctx->b.predicate_drawing);
> - cs->buf[cs->cdw++] = info.count;
> + if (likely(!info.indirect)) {
> + cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, rctx->b.predicate_drawing);
> + cs->buf[cs->cdw++] = info.count;
> + }
> + else {
> + cs->buf[cs->cdw++] = PKT3(0x24 /* PKT3_DRAW_INDIRECT */, 1, rctx->b.predicate_drawing);
> + cs->buf[cs->cdw++] = info.indirect_offset;
> + }
> cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_AUTO_INDEX |
> (info.count_from_stream_output ? S_0287F0_USE_OPAQUE(1) : 0);
> }
> --
> 1.9.1
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list