[Mesa-dev] [PATCH 08/11] nvc0: add indirect compute support on Kepler

Mon Feb 29 17:25:30 UTC 2016

On 02/27/2016 11:29 PM, Ilia Mirkin wrote:
> On Sat, Feb 27, 2016 at 9:02 AM, Samuel Pitoiset
> <samuel.pitoiset at gmail.com> wrote:
>> The grid size is stored as three 32-bit integers in the indirect
>> buffer, but the launch descriptor packs griddim_y and griddim_z into
>> a single 32-bit integer like this: (z << 16) | y. To make it work,
>> the 16 high bits of griddim_y are overwritten by griddim_z.
>>
>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
>> ---
>>   src/gallium/drivers/nouveau/nvc0/nve4_compute.c | 113 +++++++++++++++++-------
>>   1 file changed, 81 insertions(+), 32 deletions(-)
>>
>> diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
>> index 3932e89..1faef23 100644
>> --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
>> +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
>> @@ -425,9 +425,7 @@ nve4_compute_state_validate(struct nvc0_context *nvc0)
>>   static void
>>   nve4_compute_upload_input(struct nvc0_context *nvc0,
>>                             struct nve4_cp_launch_desc *desc,
>> -                          const void *input,
>> -                          const uint *block_layout,
>> -                          const uint *grid_layout)
>> +                          const struct pipe_grid_info *info)
>>   {
>>      struct nvc0_screen *screen = nvc0->screen;
>>      struct nouveau_pushbuf *push = nvc0->base.pushbuf;
>> @@ -445,7 +443,7 @@ nve4_compute_upload_input(struct nvc0_context *nvc0,
>>         PUSH_DATA (push, 0x1);
>>         BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (cp->parm_size / 4));
>>         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
>> -      PUSH_DATAp(push, input, cp->parm_size / 4);
>> +      PUSH_DATAp(push, info->input, cp->parm_size / 4);
>>
>>         /* Bind user parameters coming from clover. */
>>         /* TODO: This should be harmonized with uniform_bo. */
>> @@ -460,8 +458,17 @@ nve4_compute_upload_input(struct nvc0_context *nvc0,
>>      PUSH_DATA (push, 0x1);
>>      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
>>      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
>> -   PUSH_DATAp(push, block_layout, 3);
>> -   PUSH_DATAp(push, grid_layout, 3);
>> +   PUSH_DATAp(push, info->block, 3);
>> +   if (unlikely(info->indirect)) {
>> +      struct nv04_resource *res = nv04_resource(info->indirect);
>> +      uint32_t offset = res->offset + info->indirect_offset;
>> +
>> +      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
>> +      nouveau_pushbuf_data(push, res->bo, offset,
>> +                           NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
>> +   } else {
>> +      PUSH_DATAp(push, info->grid, 3);
>> +   }
>>      PUSH_DATA (push, 0);
>>
>>      BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
>> @@ -481,9 +488,7 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
>>   static void
>>   nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
>>                                  struct nve4_cp_launch_desc *desc,
>> -                               uint32_t label,
>> -                               const uint *block_layout,
>> -                               const uint *grid_layout)
>> +                               const struct pipe_grid_info *info)
>>   {
>>      const struct nvc0_screen *screen = nvc0->screen;
>>      struct nouveau_pushbuf *push = nvc0->base.pushbuf;
>> @@ -493,14 +498,14 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
>>
>>      nve4_cp_launch_desc_init_default(desc);
>>
>> -   desc->entry = nvc0_program_symbol_offset(cp, label);
>> +   desc->entry = nvc0_program_symbol_offset(cp, info->pc);
>>
>> -   desc->griddim_x = grid_layout[0];
>> -   desc->griddim_y = grid_layout[1];
>> -   desc->griddim_z = grid_layout[2];
>> -   desc->blockdim_x = block_layout[0];
>> -   desc->blockdim_y = block_layout[1];
>> -   desc->blockdim_z = block_layout[2];
>> +   desc->griddim_x = info->grid[0];
>> +   desc->griddim_y = info->grid[1];
>> +   desc->griddim_z = info->grid[2];
>> +   desc->blockdim_x = info->block[0];
>> +   desc->blockdim_y = info->block[1];
>> +   desc->blockdim_z = info->block[2];
>>
>>      desc->shared_size = align(cp->cp.smem_size, 0x100);
>>      desc->local_size_p = align(cp->cp.lmem_size, 0x10);
>> @@ -585,30 +590,74 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
>>      if (ret)
>>         goto out;
>>
>> -   nve4_compute_setup_launch_desc(nvc0, desc, info->pc,
>> -                                  info->block, info->grid);
>> +   nve4_compute_setup_launch_desc(nvc0, desc, info);
>>
>> -   nve4_compute_upload_input(nvc0, desc, info->input, info->block, info->grid);
>> +   nve4_compute_upload_input(nvc0, desc, info);
>>
>>   #ifdef DEBUG
>>      if (debug_get_num_option("NV50_PROG_DEBUG", 0))
>>         nve4_compute_dump_launch_desc(desc);
>>   #endif
>>
>> +   if (unlikely(info->indirect)) {
>> +      struct nv04_resource *res = nv04_resource(info->indirect);
>> +      uint32_t offset = res->offset + info->indirect_offset;
>> +
>> +      /* upload the first part of the descriptor */
>> +      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
>> +      PUSH_DATAh(push, desc_gpuaddr);
>> +      PUSH_DATA (push, desc_gpuaddr);
>> +      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
>> +      PUSH_DATA (push, 48);
>> +      PUSH_DATA (push, 1);
>> +      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (48 / 4));
>> +      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
>> +      PUSH_DATAp(push, (const uint32_t *)desc, 48 / 4);
>> +
>> +      /* upload griddim_x and griddim_y as two 32-bit integers even
>> +       * though griddim_y must be a 16-bit integer */
>> +      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
>> +      PUSH_DATAh(push, desc_gpuaddr + 48);
>> +      PUSH_DATA (push, desc_gpuaddr + 48);
>> +      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
>> +      PUSH_DATA (push, 8);
>> +      PUSH_DATA (push, 1);
>> +      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (8 / 4));
>> +      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
>> +      nouveau_pushbuf_space(push, 16, 0, 1);
>> +      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
>> +      nouveau_pushbuf_data(push, res->bo, offset,
>> +                           NVC0_IB_ENTRY_1_NO_PREFETCH | 2 * 4);
>> +
>> +      /* overwrite the 16 high bits of griddim_y with griddim_z because
>> +       * we need (z << 16) | y */
>> +      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
>> +      PUSH_DATAh(push, desc_gpuaddr + 54);
>> +      PUSH_DATA (push, desc_gpuaddr + 54);
>> +      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
>> +      PUSH_DATA (push, 4);
>> +      PUSH_DATA (push, 1);
>> +      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (4 / 4));
>> +      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
>> +      nouveau_pushbuf_data(push, res->bo, offset + 8,
>> +                           NVC0_IB_ENTRY_1_NO_PREFETCH | 1 * 4);
>> +
>> +      /* upload the last part of the descriptor */
>> +      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
>> +      PUSH_DATAh(push, desc_gpuaddr + 56);
>> +      PUSH_DATA (push, desc_gpuaddr + 56);
>> +      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
>> +      PUSH_DATA (push, 200);
>> +      PUSH_DATA (push, 1);
>> +      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (200 / 4));
>> +      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
>> +      PUSH_DATAp(push, (const uint32_t *)desc + (56 / 4), 200 / 4);
>
> I guess no harm done doing it this way, but it might be easier to just
> upload the whole thing in one go and then overwrite a few bits. This
> should "auto" happen since I'm guessing nve4_compute_upload_input()
> already uploads the descriptor?

nve4_compute_upload_input() doesn't upload the descriptor. I followed
your advice here and it works as before: the new version just uploads
the whole descriptor and then overwrites the fields we want before
flushing.

>
>> +
>> +      BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
>> +      PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB | NVE4_COMPUTE_FLUSH_CODE);
>> +   }
>> +
>>      /* upload descriptor and flush */
>> -#if 0
>> -   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
>> -   PUSH_DATAh(push, desc_gpuaddr);
>> -   PUSH_DATA (push, desc_gpuaddr);
>> -   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
>> -   PUSH_DATA (push, 256);
>> -   PUSH_DATA (push, 1);
>> -   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
>> -   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
>> -   PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
>> -   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
>> -   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB | NVE4_COMPUTE_FLUSH_CODE);
>> -#endif
>>      BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
>>      PUSH_DATA (push, desc_gpuaddr >> 8);
>>      BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);
>> --
>> 2.7.1
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev

-- 
-Samuel