[Mesa-dev] [PATCH] r600g: don't reserve more stack space than required v4

Thu Feb 21 16:37:54 PST 2013

On 02/22/2013 04:23 AM, Alex Deucher wrote:
> On Thu, Feb 21, 2013 at 6:52 PM, Vadim Girlin <vadimgirlin at gmail.com> wrote:
>> v4: implement exact computation taking into account wavefront size
>>
>> Signed-off-by: Vadim Girlin <vadimgirlin at gmail.com>
>> ---
>>   src/gallium/drivers/r600/r600_asm.c    |  44 +++++++++--
>>   src/gallium/drivers/r600/r600_asm.h    |  24 ++++--
>>   src/gallium/drivers/r600/r600_shader.c | 131 ++++++++++++++++++++++-----------
>>   3 files changed, 142 insertions(+), 57 deletions(-)
>>
>> diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
>> index 3632aa5..f041e27 100644
>> --- a/src/gallium/drivers/r600/r600_asm.c
>> +++ b/src/gallium/drivers/r600/r600_asm.c
>> @@ -86,6 +86,38 @@ static struct r600_bytecode_tex *r600_bytecode_tex(void)
>>          return tex;
>>   }
>>
>> +static unsigned stack_entry_size(enum radeon_family chip) {
>> +       /* Wavefront size:
>> +        *   64: R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/
>> +        *       Aruba/Sumo/Sumo2/redwood/juniper
>> +        *   32: R630/R730/R710/Palm/Cedar
>> +        *   16: R610/Rs780
>> +        *
>> +        * Stack row size:
>> +        *      Wavefront Size                        16  32  48  64
>> +        *      Columns per Row (R6xx/R7xx/R8xx only)  8   8   4   4
>> +        *      Columns per Row (R9xx+)                8   4   4   4 */
>> +
>> +       switch (chip) {
>> +       /* FIXME: are some chips missing here? */
>> +       /* wavefront size 16 */
>> +       case CHIP_RV610:
>> +       case CHIP_RS780:
>
> RV620
> RS880
>
> Should be 16 as well.

Thanks, I'll add them.

Vadim

>
>> +       /* wavefront size 32 */
>> +       case CHIP_RV630:
>> +       case CHIP_RV635:
>> +       case CHIP_RV730:
>> +       case CHIP_RV710:
>> +       case CHIP_PALM:
>> +       case CHIP_CEDAR:
>> +               return 8;
>> +
>> +       /* wavefront size 64 */
>> +       default:
>> +               return 4;
>> +       }
>> +}
>> +
>>   void r600_bytecode_init(struct r600_bytecode *bc,
>>                          enum chip_class chip_class,
>>                          enum radeon_family family,
>> @@ -103,6 +135,7 @@ void r600_bytecode_init(struct r600_bytecode *bc,
>>          LIST_INITHEAD(&bc->cf);
>>          bc->chip_class = chip_class;
>>          bc->msaa_texture_mode = msaa_texture_mode;
>> +       bc->stack.entry_size = stack_entry_size(family);
>>   }
>>
>>   static int r600_bytecode_add_cf(struct r600_bytecode *bc)
>> @@ -1524,8 +1557,8 @@ int r600_bytecode_build(struct r600_bytecode *bc)
>>          unsigned addr;
>>          int i, r;
>>
>> -       if (bc->callstack[0].max > 0)
>> -               bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
>> +       bc->nstack = bc->stack.max_entries;
>> +
>>          if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
>>                  bc->nstack = 1;
>>          }
>> @@ -1826,8 +1859,8 @@ void r600_bytecode_disasm(struct r600_bytecode *bc)
>>                  chip = '6';
>>                  break;
>>          }
>> -       fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n",
>> -               bc->ndw, bc->ngpr);
>> +       fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack -------------\n",
>> +               bc->ndw, bc->ngpr, bc->nstack);
>>          fprintf(stderr, "shader %d -- %c\n", index++, chip);
>>
>>          LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
>> @@ -2105,7 +2138,8 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
>>                  chip = '6';
>>                  break;
>>          }
>> -       fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n", bc->ndw, bc->ngpr);
>> +       fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack -------------\n",
>> +               bc->ndw, bc->ngpr, bc->nstack);
>>          fprintf(stderr, "     %c\n", chip);
>>
>>          LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
>> diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
>> index 03cd238..5a9869d 100644
>> --- a/src/gallium/drivers/r600/r600_asm.h
>> +++ b/src/gallium/drivers/r600/r600_asm.h
>> @@ -173,16 +173,25 @@ struct r600_cf_stack_entry {
>>   };
>>
>>   #define SQ_MAX_CALL_DEPTH 0x00000020
>> -struct r600_cf_callstack {
>> -       unsigned                        fc_sp_before_entry;
>> -       int                             sub_desc_index;
>> -       int                             current;
>> -       int                             max;
>> -};
>>
>>   #define AR_HANDLE_NORMAL 0
>>   #define AR_HANDLE_RV6XX 1 /* except RV670 */
>>
>> +struct r600_stack_info {
>> +       /* current level of non-WQM PUSH operations
>> +        * (PUSH, PUSH_ELSE, ALU_PUSH_BEFORE) */
>> +       int push;
>> +       /* current level of WQM PUSH operations
>> +        * (PUSH, PUSH_ELSE, PUSH_WQM) */
>> +       int push_wqm;
>> +       /* current loop level */
>> +       int loop;
>> +
>> +       /* required depth */
>> +       int max_entries;
>> +       /* subentries per entry */
>> +       int entry_size;
>> +};
>>
>>   struct r600_bytecode {
>>          enum chip_class                 chip_class;
>> @@ -199,8 +208,7 @@ struct r600_bytecode {
>>          uint32_t                        *bytecode;
>>          uint32_t                        fc_sp;
>>          struct r600_cf_stack_entry      fc_stack[32];
>> -       unsigned                        call_sp;
>> -       struct r600_cf_callstack        callstack[SQ_MAX_CALL_DEPTH];
>> +       struct r600_stack_info          stack;
>>          unsigned        ar_loaded;
>>          unsigned        ar_reg;
>>          unsigned        ar_chan;
>> diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
>> index 8642463..404aea7 100644
>> --- a/src/gallium/drivers/r600/r600_shader.c
>> +++ b/src/gallium/drivers/r600/r600_shader.c
>> @@ -234,7 +234,7 @@ struct r600_shader_tgsi_instruction {
>>
>>   static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
>>   static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
>> -static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only);
>> +static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
>>   static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
>>   static int tgsi_else(struct r600_shader_ctx *ctx);
>>   static int tgsi_endif(struct r600_shader_ctx *ctx);
>> @@ -412,7 +412,7 @@ static void llvm_if(struct r600_shader_ctx *ctx)
>>   {
>>          r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
>>          fc_pushlevel(ctx, FC_IF);
>> -       callstack_check_depth(ctx, FC_PUSH_VPM, 0);
>> +       callstack_push(ctx, FC_PUSH_VPM);
>>   }
>>
>>   static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx)
>> @@ -5522,63 +5522,107 @@ static int pops(struct r600_shader_ctx *ctx, int pops)
>>          return 0;
>>   }
>>
>> -static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
>> +static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
>> +                                              unsigned reason)
>> +{
>> +       struct r600_stack_info *stack = &ctx->bc->stack;
>> +       unsigned elements, entries;
>> +
>> +       unsigned entry_size = stack->entry_size;
>> +
>> +       elements = (stack->loop + stack->push_wqm ) * entry_size;
>> +       elements += stack->push;
>> +
>> +       switch (ctx->bc->chip_class) {
>> +       case R600:
>> +       case R700:
>> +               /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
>> +                * the stack must be reserved to hold the current active/continue
>> +                * masks */
>> +               if (reason == FC_PUSH_VPM) {
>> +                       elements += 2;
>> +               }
>> +               break;
>> +
>> +       case CAYMAN:
>> +               /* r9xx: any stack operation on empty stack consumes 2 additional
>> +                * elements */
>> +               elements += 2;
>> +
>> +               /* fallthrough */
>> +               /* FIXME: do the two elements added above cover the cases for the
>> +                * r8xx+ below? */
>> +
>> +       case EVERGREEN:
>> +               /* r8xx+: 2 extra elements are not always required, but one extra
>> +                * element must be added for each of the following cases:
>> +                * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
>> +                *    stack usage.
>> +                *    (Currently we don't use ALU_ELSE_AFTER.)
>> +                * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
>> +                *    PUSH instruction executed.
>> +                *
>> +                *    NOTE: it seems we also need to reserve additional element in some
>> +                *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
>> +                *    then STACK_SIZE should be 2 instead of 1 */
>> +               if (reason == FC_PUSH_VPM) {
>> +                       elements += 1;
>> +               }
>> +               break;
>> +
>> +       default:
>> +               assert(0);
>> +               break;
>> +       }
>> +
>> +       /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
>> +        * for all chips, so we use 4 in the final formula, not the real entry_size
>> +        * for the chip */
>> +       entry_size = 4;
>> +
>> +       entries = (elements + (entry_size - 1)) / entry_size;
>> +
>> +       if (entries > stack->max_entries)
>> +               stack->max_entries = entries;
>> +}
>> +
>> +static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
>>   {
>>          switch(reason) {
>>          case FC_PUSH_VPM:
>> -               ctx->bc->callstack[ctx->bc->call_sp].current--;
>> +               --ctx->bc->stack.push;
>> +               assert(ctx->bc->stack.push >= 0);
>>                  break;
>>          case FC_PUSH_WQM:
>> +               --ctx->bc->stack.push_wqm;
>> +               assert(ctx->bc->stack.push_wqm >= 0);
>> +               break;
>>          case FC_LOOP:
>> -               ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
>> +               --ctx->bc->stack.loop;
>> +               assert(ctx->bc->stack.loop >= 0);
>>                  break;
>> -       case FC_REP:
>> -               /* TOODO : for 16 vp asic should -= 2; */
>> -               ctx->bc->callstack[ctx->bc->call_sp].current --;
>> +       default:
>> +               assert(0);
>>                  break;
>>          }
>>   }
>>
>> -static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
>> +static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
>>   {
>> -       if (check_max_only) {
>> -               int diff;
>> -               switch (reason) {
>> -               case FC_PUSH_VPM:
>> -                       diff = 1;
>> -                       break;
>> -               case FC_PUSH_WQM:
>> -                       diff = 4;
>> -                       break;
>> -               default:
>> -                       assert(0);
>> -                       diff = 0;
>> -               }
>> -               if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
>> -                   ctx->bc->callstack[ctx->bc->call_sp].max) {
>> -                       ctx->bc->callstack[ctx->bc->call_sp].max =
>> -                               ctx->bc->callstack[ctx->bc->call_sp].current + diff;
>> -               }
>> -               return;
>> -       }
>>          switch (reason) {
>>          case FC_PUSH_VPM:
>> -               ctx->bc->callstack[ctx->bc->call_sp].current++;
>> +               ++ctx->bc->stack.push;
>>                  break;
>>          case FC_PUSH_WQM:
>> +               ++ctx->bc->stack.push_wqm;
>>          case FC_LOOP:
>> -               ctx->bc->callstack[ctx->bc->call_sp].current += 4;
>> -               break;
>> -       case FC_REP:
>> -               ctx->bc->callstack[ctx->bc->call_sp].current++;
>> +               ++ctx->bc->stack.loop;
>>                  break;
>> +       default:
>> +               assert(0);
>>          }
>>
>> -       if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
>> -           ctx->bc->callstack[ctx->bc->call_sp].max) {
>> -               ctx->bc->callstack[ctx->bc->call_sp].max =
>> -                       ctx->bc->callstack[ctx->bc->call_sp].current;
>> -       }
>> +       callstack_update_max_depth(ctx, reason);
>>   }
>>
>>   static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
>> @@ -5665,7 +5709,7 @@ static int tgsi_if(struct r600_shader_ctx *ctx)
>>
>>          fc_pushlevel(ctx, FC_IF);
>>
>> -       callstack_check_depth(ctx, FC_PUSH_VPM, 0);
>> +       callstack_push(ctx, FC_PUSH_VPM);
>>          return 0;
>>   }
>>
>> @@ -5695,7 +5739,7 @@ static int tgsi_endif(struct r600_shader_ctx *ctx)
>>          }
>>          fc_poplevel(ctx);
>>
>> -       callstack_decrease_current(ctx, FC_PUSH_VPM);
>> +       callstack_pop(ctx, FC_PUSH_VPM);
>>          return 0;
>>   }
>>
>> @@ -5708,7 +5752,7 @@ static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
>>          fc_pushlevel(ctx, FC_LOOP);
>>
>>          /* check stack depth */
>> -       callstack_check_depth(ctx, FC_LOOP, 0);
>> +       callstack_push(ctx, FC_LOOP);
>>          return 0;
>>   }
>>
>> @@ -5737,7 +5781,7 @@ static int tgsi_endloop(struct r600_shader_ctx *ctx)
>>          }
>>          /* XXX add LOOPRET support */
>>          fc_poplevel(ctx);
>> -       callstack_decrease_current(ctx, FC_LOOP);
>> +       callstack_pop(ctx, FC_LOOP);
>>          return 0;
>>   }
>>
>> @@ -5760,7 +5804,6 @@ static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
>>
>>          fc_set_mid(ctx, fscp);
>>
>> -       callstack_check_depth(ctx, FC_PUSH_VPM, 1);
>>          return 0;
>>   }
>>
>> --
>> 1.8.1.2
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev