[Mesa-dev] [PATCH v3 13/14] radeonsi: Process multiple patches per threadgroup.

Thu May 26 14:52:28 UTC 2016

Patches 12-13:

Reviewed-by: Marek Olšák <marek.olsak at amd.com>

Marek

On Thu, May 26, 2016 at 3:33 PM, Bas Nieuwenhuizen
<bas at basnieuwenhuizen.nl> wrote:
> Using more than 1 wave per threadgroup generally increases
> performance. Not using too many patches per threadgroup also
> increases performance. Both Catalyst and amdgpu-pro seem to
> use 40 patches as their maximum, but I haven't really seen
> any performance increase from limiting the number of patches
> to 40 instead of 64.
>
> Note that the trick where we overlap the input and output LDS
> no longer works, because inserting the tess factors changes
> the patch stride.
>
> v2: - Add comment about LDS assumptions.
>     - Add constant for buffer size.
>     - Fix code style.
>
> v3: - Correct limits for not splitting patches between waves.
>     - Set max num_patches to 40 as in the proprietary driver.
>
> Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
> ---
>  src/gallium/drivers/radeonsi/si_state_draw.c | 50 +++++++++++++++++++---------
>  1 file changed, 35 insertions(+), 15 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index 6fe2619..c8b87a9 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -108,20 +108,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
>         unsigned input_patch_size, output_patch_size, output_patch0_offset;
>         unsigned perpatch_output_offset, lds_size, ls_rsrc2;
>         unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;
> -       unsigned offchip_layout;
> -
> -       *num_patches = 1; /* TODO: calculate this */
> -
> -       if (sctx->last_ls == ls->current &&
> -           sctx->last_tcs == tcs &&
> -           sctx->last_tes_sh_base == tes_sh_base &&
> -           sctx->last_num_tcs_input_cp == num_tcs_input_cp)
> -               return;
> -
> -       sctx->last_ls = ls->current;
> -       sctx->last_tcs = tcs;
> -       sctx->last_tes_sh_base = tes_sh_base;
> -       sctx->last_num_tcs_input_cp = num_tcs_input_cp;
> +       unsigned offchip_layout, hardware_lds_size;
>
>         /* This calculates how shader inputs and outputs among VS, TCS, and TES
>          * are laid out in LDS. */
> @@ -146,7 +133,29 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
>         pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
>         output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
>
> -       output_patch0_offset = sctx->tcs_shader.cso ? input_patch_size * *num_patches : 0;
> +       /* Ensure that we only need one wave per SIMD so we don't need to check
> +        * resource usage. Also ensures that the number of tcs in and out
> +        * vertices per threadgroup is at most 256.
> +        */
> +       *num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4;
> +
> +       /* Make sure that the data fits in LDS. This assumes the shaders only
> +        * use LDS for the inputs and outputs.
> +        */
> +       hardware_lds_size = sctx->b.chip_class >= CIK ? 65536 : 32768;
> +       *num_patches = MIN2(*num_patches, hardware_lds_size / (input_patch_size +
> +                                                              output_patch_size));
> +
> +       /* Make sure the output data fits in the offchip buffer */
> +       *num_patches = MIN2(*num_patches, SI_TESS_OFFCHIP_BLOCK_SIZE /
> +                                         output_patch_size);
> +
> +       /* Not necessary for correctness, but improves performance. The
> +        * specific value is taken from the proprietary driver.
> +        */
> +       *num_patches = MIN2(*num_patches, 40);
> +
> +       output_patch0_offset = input_patch_size * *num_patches;
>         perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
>
>         lds_size = output_patch0_offset + output_patch_size * *num_patches;
> @@ -160,6 +169,17 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
>                 ls_rsrc2 |= S_00B52C_LDS_SIZE(align(lds_size, 256) / 256);
>         }
>
> +       if (sctx->last_ls == ls->current &&
> +           sctx->last_tcs == tcs &&
> +           sctx->last_tes_sh_base == tes_sh_base &&
> +           sctx->last_num_tcs_input_cp == num_tcs_input_cp)
> +               return;
> +
> +       sctx->last_ls = ls->current;
> +       sctx->last_tcs = tcs;
> +       sctx->last_tes_sh_base = tes_sh_base;
> +       sctx->last_num_tcs_input_cp = num_tcs_input_cp;
> +
>         /* Due to a hw bug, RSRC2_LS must be written twice with another
>          * LS register written in between. */
>         if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
> --
> 2.8.3
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev