[Mesa-dev] [PATCH v2 07/13] nvc0: add indirect compute support on Kepler
Ilia Mirkin
imirkin at alum.mit.edu
Fri Apr 1 15:21:39 UTC 2016
On Fri, Apr 1, 2016 at 10:24 AM, Samuel Pitoiset
<samuel.pitoiset at gmail.com> wrote:
>
>
> On 04/01/2016 07:34 AM, Ilia Mirkin wrote:
>>
>>
>> On Mar 31, 2016 12:09 PM, "Samuel Pitoiset" <samuel.pitoiset at gmail.com
>> <mailto:samuel.pitoiset at gmail.com>> wrote:
>> >
>> > The grid size is stored as three 32-bit integers in the indirect
>> > buffer, but the launch descriptor packs griddim_y and griddim_z into
>> > a single 32-bit integer as (z << 16) | y. To make it work,
>> > the 16 high bits of griddim_y are overwritten by griddim_z.
>> >
>> > Changes in v2:
>> > - upload the whole descriptor and overwrite a few bits
>> >
>> > Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com
>> <mailto:samuel.pitoiset at gmail.com>>
>>
>> > ---
>> > src/gallium/drivers/nouveau/nvc0/nve4_compute.c | 99
>> +++++++++++++++++--------
>> > 1 file changed, 67 insertions(+), 32 deletions(-)
>> >
>> > diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
>> b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
>> > index 4d4808c..1a2afee 100644
>> > --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
>> > +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
>> > @@ -435,9 +435,7 @@ nve4_state_validate_cp(struct nvc0_context *nvc0,
>> uint32_t mask)
>> > static void
>> > nve4_compute_upload_input(struct nvc0_context *nvc0,
>> > struct nve4_cp_launch_desc *desc,
>> > - const void *input,
>> > - const uint *block_layout,
>> > - const uint *grid_layout)
>> > + const struct pipe_grid_info *info)
>> > {
>> > struct nvc0_screen *screen = nvc0->screen;
>> > struct nouveau_pushbuf *push = nvc0->base.pushbuf;
>> > @@ -455,7 +453,7 @@ nve4_compute_upload_input(struct nvc0_context
>> *nvc0,
>> > PUSH_DATA (push, 0x1);
>> > BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (cp->parm_size / 4));
>> > PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
>> > - PUSH_DATAp(push, input, cp->parm_size / 4);
>> > + PUSH_DATAp(push, info->input, cp->parm_size / 4);
>> >
>> > /* Bind user parameters coming from clover. */
>> > /* TODO: This should be harmonized with uniform_bo. */
>> > @@ -470,8 +468,17 @@ nve4_compute_upload_input(struct nvc0_context
>> *nvc0,
>> > PUSH_DATA (push, 0x1);
>> > BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
>> > PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
>> > - PUSH_DATAp(push, block_layout, 3);
>> > - PUSH_DATAp(push, grid_layout, 3);
>> > + PUSH_DATAp(push, info->block, 3);
>> > + if (unlikely(info->indirect)) {
>> > + struct nv04_resource *res = nv04_resource(info->indirect);
>> > + uint32_t offset = res->offset + info->indirect_offset;
>> > +
>> > + PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
>>
>> This can get you into trouble, as PUSH_REFN might flush things. You need to
>> stick a nouveau_pushbuf_space() call before the BEGIN.
>
>
> Good catch.
>
>
>>
>> > + nouveau_pushbuf_data(push, res->bo, offset,
>> > + NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
>> > + } else {
>> > + PUSH_DATAp(push, info->grid, 3);
>> > + }
>> > PUSH_DATA (push, 0);
>> >
>> > BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
>> > @@ -491,23 +498,21 @@ nve4_compute_derive_cache_split(struct
>> nvc0_context *nvc0, uint32_t shared_size)
>> > static void
>> > nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
>> > struct nve4_cp_launch_desc *desc,
>> > - uint32_t label,
>> > - const uint *block_layout,
>> > - const uint *grid_layout)
>> > + const struct pipe_grid_info *info)
>> > {
>> > const struct nvc0_screen *screen = nvc0->screen;
>> > const struct nvc0_program *cp = nvc0->compprog;
>> >
>> > nve4_cp_launch_desc_init_default(desc);
>> >
>> > - desc->entry = nvc0_program_symbol_offset(cp, label);
>> > + desc->entry = nvc0_program_symbol_offset(cp, info->pc);
>> >
>> > - desc->griddim_x = grid_layout[0];
>> > - desc->griddim_y = grid_layout[1];
>> > - desc->griddim_z = grid_layout[2];
>> > - desc->blockdim_x = block_layout[0];
>> > - desc->blockdim_y = block_layout[1];
>> > - desc->blockdim_z = block_layout[2];
>> > + desc->griddim_x = info->grid[0];
>> > + desc->griddim_y = info->grid[1];
>> > + desc->griddim_z = info->grid[2];
>> > + desc->blockdim_x = info->block[0];
>> > + desc->blockdim_y = info->block[1];
>> > + desc->blockdim_z = info->block[2];
>> >
>> > desc->shared_size = align(cp->cp.smem_size, 0x100);
>> > desc->local_size_p = align(cp->cp.lmem_size, 0x10);
>> > @@ -566,30 +571,60 @@ nve4_launch_grid(struct pipe_context *pipe,
>> const struct pipe_grid_info *info)
>> > if (ret)
>> > goto out;
>> >
>> > - nve4_compute_setup_launch_desc(nvc0, desc, info->pc,
>> > - info->block, info->grid);
>> > + nve4_compute_setup_launch_desc(nvc0, desc, info);
>> >
>> > - nve4_compute_upload_input(nvc0, desc, info->input, info->block,
>> info->grid);
>> > + nve4_compute_upload_input(nvc0, desc, info);
>> >
>> > #ifdef DEBUG
>> > if (debug_get_num_option("NV50_PROG_DEBUG", 0))
>> > nve4_compute_dump_launch_desc(desc);
>> > #endif
>> >
>> > + if (unlikely(info->indirect)) {
>> > + struct nv04_resource *res = nv04_resource(info->indirect);
>> > + uint32_t offset = res->offset + info->indirect_offset;
>> > +
>> > + /* upload the descriptor */
>> > + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
>> > + PUSH_DATAh(push, desc_gpuaddr);
>> > + PUSH_DATA (push, desc_gpuaddr);
>> > + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
>> > + PUSH_DATA (push, 256);
>> > + PUSH_DATA (push, 1);
>> > + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
>> > + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
>> > + PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
>> > +
>> > + /* overwrite griddim_x and griddim_y as two 32-bits integers
>> even
>> > + * if griddim_y must be a 16-bits integer */
>> > + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
>> > + PUSH_DATAh(push, desc_gpuaddr + 48);
>> > + PUSH_DATA (push, desc_gpuaddr + 48);
>> > + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
>> > + PUSH_DATA (push, 8);
>> > + PUSH_DATA (push, 1);
>> > + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (8 / 4));
>> > + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
>> > + nouveau_pushbuf_space(push, 16, 0, 1);
>> > + PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
>>
>> Right, so like this, but before the begin :)
>
>
> Mmmh, are you sure it's required to put nouveau_pushbuf_space() before the
> begin? We don't do that for indirect compute on Fermi, or maybe it's also
> broken.
It took me about 37 attempts to get indirect draw right. Take a look
at how indirect draw works now - that is the One True Way. Everything
else might break.
More information about the mesa-dev mailing list