[Mesa-dev] [PATCH 3/3] radeonsi/gfx9: implement primitive binning

Mon Sep 4 17:41:33 UTC 2017

On Mon, Sep 4, 2017 at 3:56 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> On 01.09.2017 02:57, Marek Olšák wrote:
>>
>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> This increases performance, but it was tuned for Raven, not Vega.
>> We don't know yet how Vega will perform, hopefully not worse.
>> ---
>>   src/gallium/drivers/radeon/r600_pipe_common.c   |   2 +
>>   src/gallium/drivers/radeon/r600_pipe_common.h   |   2 +
>>   src/gallium/drivers/radeonsi/Makefile.sources   |   1 +
>>   src/gallium/drivers/radeonsi/si_hw_context.c    |   2 +
>>   src/gallium/drivers/radeonsi/si_pipe.c          |   5 +
>>   src/gallium/drivers/radeonsi/si_pipe.h          |   2 +
>>   src/gallium/drivers/radeonsi/si_state.c         |  26 +-
>>   src/gallium/drivers/radeonsi/si_state.h         |   6 +-
>>   src/gallium/drivers/radeonsi/si_state_binning.c | 448
>> ++++++++++++++++++++++++
>>   src/gallium/drivers/radeonsi/si_state_shaders.c |   2 +
>>   10 files changed, 489 insertions(+), 7 deletions(-)
>>   create mode 100644 src/gallium/drivers/radeonsi/si_state_binning.c
>>
> [snip]
>
>> diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c
>> b/src/gallium/drivers/radeonsi/si_state_binning.c
>> new file mode 100644
>> index 0000000..56bcdc8
>> --- /dev/null
>> +++ b/src/gallium/drivers/radeonsi/si_state_binning.c
>> @@ -0,0 +1,448 @@
>> +/*
>> + * Copyright 2017 Advanced Micro Devices, Inc.
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining
>> a
>> + * copy of this software and associated documentation files (the
>> "Software"),
>> + * to deal in the Software without restriction, including without
>> limitation
>> + * on the rights to use, copy, modify, merge, publish, distribute, sub
>> + * license, and/or sell copies of the Software, and to permit persons to
>> whom
>> + * the Software is furnished to do so, subject to the following
>> conditions:
>> + *
>> + * The above copyright notice and this permission notice (including the
>> next
>> + * paragraph) shall be included in all copies or substantial portions of
>> the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>> EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>> MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT
>> SHALL
>> + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
>> + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
>> + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
>> THE
>> + * USE OR OTHER DEALINGS IN THE SOFTWARE.
>> + */
>> +
>> +/* This file handles register programming of primitive binning. */
>> +
>> +#include "si_pipe.h"
>> +#include "sid.h"
>> +#include "gfx9d.h"
>> +#include "radeon/r600_cs.h"
>> +
>> +struct uvec2 {
>> +       unsigned x, y;
>> +};
>> +
>> +struct si_bin_size_map {
>> +       unsigned start;
>> +       unsigned bin_size_x;
>> +       unsigned bin_size_y;
>> +};
>> +
>> +typedef struct si_bin_size_map si_bin_size_subtable[3][9];
>> +
>> +/* Find the bin size where sum is >= table[i].start and < table[i +
>> 1].start. */
>> +static struct uvec2 si_find_bin_size(struct si_screen *sscreen,
>> +                                    const si_bin_size_subtable table[],
>> +                                    unsigned sum)
>> +{
>> +       unsigned log_num_rb_per_se =
>> +               util_logbase2_ceil(sscreen->b.info.num_render_backends /
>> +                                  sscreen->b.info.max_se);
>> +       unsigned log_num_se = util_logbase2_ceil(sscreen->b.info.max_se);
>> +       unsigned i;
>> +
>> +       /* Get the chip-specific subtable. */
>> +       const struct si_bin_size_map *subtable =
>> +               &table[log_num_rb_per_se][log_num_se][0];
>> +
>> +       for (i = 0; subtable[i].bin_size_x != 0; i++) {
>> +               if (sum >= subtable[i].start && sum < subtable[i +
>> 1].start)
>> +                       break;
>> +       }
>> +
>> +       struct uvec2 size = {subtable[i].bin_size_x,
>> subtable[i].bin_size_y};
>> +       return size;
>> +}
>> +
>> +static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
>> +                                         unsigned cb_target_enabled_4bit)
>> +{
>> +       unsigned nr_samples = sctx->framebuffer.nr_samples;
>> +       unsigned sum = 0;
>> +
>> +       /* Compute the sum of all Bpp. */
>> +       for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
>> +               if (!(cb_target_enabled_4bit & (0xf << (i * 4))))
>> +                       continue;
>> +
>> +               struct r600_texture *rtex =
>> +                       (struct
>> r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
>> +               sum += rtex->surface.bpe;
>> +       }
>
>
> I believe this should early-out for !sum, for depth-only rendering.

See the table. There are different values for depth-only rendering
depending on the number of SEs / RBs. I don't why the depth function
has early-out but not the color function.

>
>
>> +
>> +       /* Multiply the sum by some function of the number of samples. */
>> +       if (nr_samples >= 2) {
>> +               if (sctx->ps_iter_samples >= 2)
>> +                       sum *= nr_samples;
>> +               else
>> +                       sum *= 2;
>> +       }
>> +
>> +       static const si_bin_size_subtable table[] = {
>> +               {
>> +                       /* One RB / SE */
>> +                       {
>> +                               /* One shader engine */
>> +                               {        0,  128,  128 },
>> +                               {        1,   64,  128 },
>> +                               {        2,   32,  128 },
>> +                               {        3,   16,  128 },
>> +                               {       17,    0,    0 },
>> +                               { UINT_MAX,    0,    0 },
>
>
> All the UINT_MAX lines in this table should be unnecessary, given that
> si_find_bin_size bails out when it sees bin_size_x == 0.

Will fix.

>
>
>
>> +                       },
>> +                       {
>> +                               /* Two shader engines */
>> +                               {        0,  128,  128 },
>> +                               {        2,   64,  128 },
>> +                               {        3,   32,  128 },
>> +                               {        5,   16,  128 },
>> +                               {       17,    0,    0 },
>> +                               { UINT_MAX,    0,    0 },
>> +                       },
>> +                       {
>> +                               /* Four shader engines */
>> +                               {        0,  128,  128 },
>> +                               {        3,   64,  128 },
>> +                               {        5,   16,  128 },
>> +                               {       17,    0,    0 },
>> +                               { UINT_MAX,    0,    0 },
>> +                       },
>> +               },
>> +               {
>> +                       /* Two RB / SE */
>> +                       {
>> +                               /* One shader engine */
>> +                               {        0,  128,  128 },
>> +                               {        2,   64,  128 },
>> +                               {        3,   32,  128 },
>> +                               {        5,   16,  128 },
>> +                               {       33,    0,    0 },
>> +                               { UINT_MAX,    0,    0 },
>> +                       },
>> +                       {
>> +                               /* Two shader engines */
>> +                               {        0,  128,  128 },
>> +                               {        3,   64,  128 },
>> +                               {        5,   32,  128 },
>> +                               {        9,   16,  128 },
>> +                               {       33,    0,    0 },
>> +                               { UINT_MAX,    0,    0 },
>> +                       },
>> +                       {
>> +                               /* Four shader engines */
>> +                               {        0,  256,  256 },
>> +                               {        2,  128,  256 },
>> +                               {        3,  128,  128 },
>> +                               {        5,   64,  128 },
>> +                               {        9,   16,  128 },
>> +                               {       33,    0,    0 },
>> +                               { UINT_MAX,    0,    0 },
>> +                       },
>> +               },
>> +               {
>> +                       /* Four RB / SE */
>> +                       {
>> +                               /* One shader engine */
>> +                               {        0,  128,  256 },
>> +                               {        2,  128,  128 },
>> +                               {        3,   64,  128 },
>> +                               {        5,   32,  128 },
>> +                               {        9,   16,  128 },
>> +                               {       33,    0,    0 },
>> +                               { UINT_MAX,    0,    0 },
>> +                       },
>> +                       {
>> +                               /* Two shader engines */
>> +                               {        0,  256,  256 },
>> +                               {        2,  128,  256 },
>> +                               {        3,  128,  128 },
>> +                               {        5,   64,  128 },
>> +                               {        9,   32,  128 },
>> +                               {       17,   16,  128 },
>> +                               {       33,    0,    0 },
>> +                               { UINT_MAX,    0,    0 },
>> +                       },
>> +                       {
>> +                               /* Four shader engines */
>> +                               {        0,  256,  512 },
>> +                               {        2,  256,  256 },
>> +                               {        3,  128,  256 },
>> +                               {        5,  128,  128 },
>> +                               {        9,   64,  128 },
>> +                               {       17,   16,  128 },
>> +                               {       33,    0,    0 },
>> +                               { UINT_MAX,    0,    0 },
>> +                       },
>> +               },
>> +       };
>> +
>> +       return si_find_bin_size(sctx->screen, table, sum);
>> +}
>> +
>> +static struct uvec2 si_get_depth_bin_size(struct si_context *sctx)
>> +{
>> +       struct si_state_dsa *dsa = sctx->queued.named.dsa;
>> +
>> +       if (!sctx->framebuffer.state.zsbuf ||
>> +           (!dsa->depth_enabled && !dsa->stencil_enabled)) {
>> +               /* Return the max size. */
>> +               struct uvec2 size = {512, 512};
>> +               return size;
>> +       }
>> +
>> +       struct r600_texture *rtex =
>> +               (struct
>> r600_texture*)sctx->framebuffer.state.zsbuf->texture;
>> +       unsigned depth_coeff = dsa->depth_enabled ? 5 : 0;
>> +       unsigned stencil_coeff = rtex->surface.flags & RADEON_SURF_SBUFFER
>> &&
>> +                                dsa->stencil_enabled ? 1 : 0;
>> +       unsigned sum = 4 * (depth_coeff + stencil_coeff) *
>> +                      sctx->framebuffer.nr_samples;
>
>
> sum will always be at least 4, which means that the first two entries in
> each sub-table below will never be used. Can we remove them? (The tables
> originate from internal docs, but I see you followed the Vulkan lead of
> removing some entries already anyway, so...)

I just copied the code from Vulkan. I'd like to keep it similar for
easy porting.

>
> Also, do you understand where the 5 comes from? A reasonable guess is HTILE,
> but that uses much less than 1/4 the memory of raw Z data.

Not sure if it matters, but HTILE overhead isn't just the added memory
size. I think it also adds latency, because HTILE adds a level of
indirection.

Is re-sending necessary?

Marek