<p dir="ltr"><br>
On Mar 31, 2016 12:09 PM, "Samuel Pitoiset" <<a href="mailto:samuel.pitoiset@gmail.com">samuel.pitoiset@gmail.com</a>> wrote:<br>
><br>
> The grid size is stored as three 32-bit integers in the indirect<br>
> buffer but the launch descriptor uses a 32-bit integer for both<br>
> griddim_y and griddim_z like this (z << 16) | y. To make it work,<br>
> the 16 high bits of griddim_y are overwritten by griddim_z.<br>
><br>
> Changes from v2:<br>
>  - upload the whole descriptor and overwrite a few bits<br>
><br>
> Signed-off-by: Samuel Pitoiset <<a href="mailto:samuel.pitoiset@gmail.com">samuel.pitoiset@gmail.com</a>><br>
> ---<br>
>  src/gallium/drivers/nouveau/nvc0/nve4_compute.c | 99 +++++++++++++++++--------<br>
>  1 file changed, 67 insertions(+), 32 deletions(-)<br>
><br>
> diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c<br>
> index 4d4808c..1a2afee 100644<br>
> --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c<br>
> +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c<br>
> @@ -435,9 +435,7 @@ nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)<br>
>  static void<br>
>  nve4_compute_upload_input(struct nvc0_context *nvc0,<br>
>                            struct nve4_cp_launch_desc *desc,<br>
> -                          const void *input,<br>
> -                          const uint *block_layout,<br>
> -                          const uint *grid_layout)<br>
> +                          const struct pipe_grid_info *info)<br>
>  {<br>
>     struct nvc0_screen *screen = nvc0->screen;<br>
>     struct nouveau_pushbuf *push = nvc0->base.pushbuf;<br>
> @@ -455,7 +453,7 @@ nve4_compute_upload_input(struct nvc0_context *nvc0,<br>
>        PUSH_DATA (push, 0x1);<br>
>        BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (cp->parm_size / 4));<br>
>        PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));<br>
> -      PUSH_DATAp(push, input, cp->parm_size / 4);<br>
> +      PUSH_DATAp(push, info->input, cp->parm_size / 4);<br>
><br>
>        /* Bind user parameters coming from clover. */<br>
>        /* TODO: This should be harmonized with uniform_bo. */<br>
> @@ -470,8 +468,17 @@ nve4_compute_upload_input(struct nvc0_context *nvc0,<br>
>     PUSH_DATA (push, 0x1);<br>
>     BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);<br>
>     PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));<br>
> -   PUSH_DATAp(push, block_layout, 3);<br>
> -   PUSH_DATAp(push, grid_layout, 3);<br>
> +   PUSH_DATAp(push, info->block, 3);<br>
> +   if (unlikely(info->indirect)) {<br>
> +      struct nv04_resource *res = nv04_resource(info->indirect);<br>
> +      uint32_t offset = res->offset + info->indirect_offset;<br>
> +<br>
> +      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);</p>
<p dir="ltr">This can get you into trouble, as the PUSH_REFN might flush the pushbuf after the BEGIN_1IC0 header has already been emitted. You need to stick a nouveau_pushbuf_space call before the BEGIN_1IC0.</p>
<p dir="ltr">> +      nouveau_pushbuf_data(push, res->bo, offset,<br>
> +                           NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);<br>
> +   } else {<br>
> +      PUSH_DATAp(push, info->grid, 3);<br>
> +   }<br>
>     PUSH_DATA (push, 0);<br>
><br>
>     BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);<br>
> @@ -491,23 +498,21 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)<br>
>  static void<br>
>  nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,<br>
>                                 struct nve4_cp_launch_desc *desc,<br>
> -                               uint32_t label,<br>
> -                               const uint *block_layout,<br>
> -                               const uint *grid_layout)<br>
> +                               const struct pipe_grid_info *info)<br>
>  {<br>
>     const struct nvc0_screen *screen = nvc0->screen;<br>
>     const struct nvc0_program *cp = nvc0->compprog;<br>
><br>
>     nve4_cp_launch_desc_init_default(desc);<br>
><br>
> -   desc->entry = nvc0_program_symbol_offset(cp, label);<br>
> +   desc->entry = nvc0_program_symbol_offset(cp, info->pc);<br>
><br>
> -   desc->griddim_x = grid_layout[0];<br>
> -   desc->griddim_y = grid_layout[1];<br>
> -   desc->griddim_z = grid_layout[2];<br>
> -   desc->blockdim_x = block_layout[0];<br>
> -   desc->blockdim_y = block_layout[1];<br>
> -   desc->blockdim_z = block_layout[2];<br>
> +   desc->griddim_x = info->grid[0];<br>
> +   desc->griddim_y = info->grid[1];<br>
> +   desc->griddim_z = info->grid[2];<br>
> +   desc->blockdim_x = info->block[0];<br>
> +   desc->blockdim_y = info->block[1];<br>
> +   desc->blockdim_z = info->block[2];<br>
><br>
>     desc->shared_size = align(cp->cp.smem_size, 0x100);<br>
>     desc->local_size_p = align(cp->cp.lmem_size, 0x10);<br>
> @@ -566,30 +571,60 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)<br>
>     if (ret)<br>
>        goto out;<br>
><br>
> -   nve4_compute_setup_launch_desc(nvc0, desc, info->pc,<br>
> -                                  info->block, info->grid);<br>
> +   nve4_compute_setup_launch_desc(nvc0, desc, info);<br>
><br>
> -   nve4_compute_upload_input(nvc0, desc, info->input, info->block, info->grid);<br>
> +   nve4_compute_upload_input(nvc0, desc, info);<br>
><br>
>  #ifdef DEBUG<br>
>     if (debug_get_num_option("NV50_PROG_DEBUG", 0))<br>
>        nve4_compute_dump_launch_desc(desc);<br>
>  #endif<br>
><br>
> +   if (unlikely(info->indirect)) {<br>
> +      struct nv04_resource *res = nv04_resource(info->indirect);<br>
> +      uint32_t offset = res->offset + info->indirect_offset;<br>
> +<br>
> +      /* upload the descriptor */<br>
> +      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);<br>
> +      PUSH_DATAh(push, desc_gpuaddr);<br>
> +      PUSH_DATA (push, desc_gpuaddr);<br>
> +      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);<br>
> +      PUSH_DATA (push, 256);<br>
> +      PUSH_DATA (push, 1);<br>
> +      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));<br>
> +      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));<br>
> +      PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);<br>
> +<br>
> +      /* overwrite griddim_x and griddim_y as two 32-bits integers even<br>
> +       * if griddim_y must be a 16-bits integer */<br>
> +      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);<br>
> +      PUSH_DATAh(push, desc_gpuaddr + 48);<br>
> +      PUSH_DATA (push, desc_gpuaddr + 48);<br>
> +      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);<br>
> +      PUSH_DATA (push, 8);<br>
> +      PUSH_DATA (push, 1);<br>
> +      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (8 / 4));<br>
> +      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));<br>
> +      nouveau_pushbuf_space(push, 16, 0, 1);<br>
> +      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);</p>
<p dir="ltr">Right, so like this, but the nouveau_pushbuf_space call needs to go before the BEGIN_1IC0 :)</p>
<p dir="ltr">> +      nouveau_pushbuf_data(push, res->bo, offset,<br>
> +                           NVC0_IB_ENTRY_1_NO_PREFETCH | 2 * 4);<br>
> +<br>
> +      /* overwrite the 16 high bits of griddim_y with griddim_z because<br>
> +       * we need (z << 16) | y */<br>
> +      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);<br>
> +      PUSH_DATAh(push, desc_gpuaddr + 54);<br>
> +      PUSH_DATA (push, desc_gpuaddr + 54);<br>
> +      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);<br>
> +      PUSH_DATA (push, 4);<br>
> +      PUSH_DATA (push, 1);<br>
> +      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (4 / 4));<br>
> +      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));<br>
> +      nouveau_pushbuf_data(push, res->bo, offset + 8,<br>
> +                           NVC0_IB_ENTRY_1_NO_PREFETCH | 1 * 4);<br>
> +   }<br>
> +<br>
>     /* upload descriptor and flush */<br>
> -#if 0<br>
> -   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);<br>
> -   PUSH_DATAh(push, desc_gpuaddr);<br>
> -   PUSH_DATA (push, desc_gpuaddr);<br>
> -   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);<br>
> -   PUSH_DATA (push, 256);<br>
> -   PUSH_DATA (push, 1);<br>
> -   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));<br>
> -   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));<br>
> -   PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);<br>
> -   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);<br>
> -   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB | NVE4_COMPUTE_FLUSH_CODE);<br>
> -#endif<br>
>     BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);<br>
>     PUSH_DATA (push, desc_gpuaddr >> 8);<br>
>     BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);<br>
> --<br>
> 2.7.4<br>
><br>
> _______________________________________________<br>
> mesa-dev mailing list<br>
> <a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
> <a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
</p>