[Mesa-dev] [PATCH] r600g: don't reserve more stack space than required v4

Sun Mar 31 13:00:28 PDT 2013

Hi Vadim,

Does this patch work ? (It's still not pushed)
I'm working on doing native control flow for llvm and intend to port your patch on the control flow reservation.

Vincent

----- Mail original -----
> De : Vadim Girlin <vadimgirlin at gmail.com>
> À : Alex Deucher <alexdeucher at gmail.com>
> Cc : mesa-dev at lists.freedesktop.org
> Envoyé le : Vendredi 22 février 2013 1h37
> Objet : Re: [Mesa-dev] [PATCH] r600g: don't reserve more stack space than required v4
> 
> On 02/22/2013 04:23 AM, Alex Deucher wrote:
>>  On Thu, Feb 21, 2013 at 6:52 PM, Vadim Girlin <vadimgirlin at gmail.com> 
> wrote:
>>>  v4: implement exact computation taking into account wavefront size
>>> 
>>>  Signed-off-by: Vadim Girlin <vadimgirlin at gmail.com>
>>>  ---
>>>    src/gallium/drivers/r600/r600_asm.c    |  44 +++++++++--
>>>    src/gallium/drivers/r600/r600_asm.h    |  24 ++++--
>>>    src/gallium/drivers/r600/r600_shader.c | 131 
> ++++++++++++++++++++++-----------
>>>    3 files changed, 142 insertions(+), 57 deletions(-)
>>> 
>>>  diff --git a/src/gallium/drivers/r600/r600_asm.c 
> b/src/gallium/drivers/r600/r600_asm.c
>>>  index 3632aa5..f041e27 100644
>>>  --- a/src/gallium/drivers/r600/r600_asm.c
>>>  +++ b/src/gallium/drivers/r600/r600_asm.c
>>>  @@ -86,6 +86,38 @@ static struct r600_bytecode_tex 
> *r600_bytecode_tex(void)
>>>           return tex;
>>>    }
>>> 
>>>  +static unsigned stack_entry_size(enum radeon_family chip) {
>>>  +       /* Wavefront size:
>>>  +        *   64: R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/
>>>  +        *       Aruba/Sumo/Sumo2/redwood/juniper
>>>  +        *   32: R630/R730/R710/Palm/Cedar
>>>  +        *   16: R610/Rs780
>>>  +        *
>>>  +        * Stack row size:
>>>  +        *      Wavefront Size                        16  32  48  64
>>>  +        *      Columns per Row (R6xx/R7xx/R8xx only)  8   8   4   4
>>>  +        *      Columns per Row (R9xx+)                8   4   4   4 */
>>>  +
>>>  +       switch (chip) {
>>>  +       /* FIXME: are some chips missing here? */
>>>  +       /* wavefront size 16 */
>>>  +       case CHIP_RV610:
>>>  +       case CHIP_RS780:
>> 
>>  RV620
>>  RS880
>> 
>>  Should be 16 as well.
> 
> Thanks, I'll add them.
> 
> Vadim
> 
>> 
>>>  +       /* wavefront size 32 */
>>>  +       case CHIP_RV630:
>>>  +       case CHIP_RV635:
>>>  +       case CHIP_RV730:
>>>  +       case CHIP_RV710:
>>>  +       case CHIP_PALM:
>>>  +       case CHIP_CEDAR:
>>>  +               return 8;
>>>  +
>>>  +       /* wavefront size 64 */
>>>  +       default:
>>>  +               return 4;
>>>  +       }
>>>  +}
>>>  +
>>>    void r600_bytecode_init(struct r600_bytecode *bc,
>>>                           enum chip_class chip_class,
>>>                           enum radeon_family family,
>>>  @@ -103,6 +135,7 @@ void r600_bytecode_init(struct r600_bytecode *bc,
>>>           LIST_INITHEAD(&bc->cf);
>>>           bc->chip_class = chip_class;
>>>           bc->msaa_texture_mode = msaa_texture_mode;
>>>  +       bc->stack.entry_size = stack_entry_size(family);
>>>    }
>>> 
>>>    static int r600_bytecode_add_cf(struct r600_bytecode *bc)
>>>  @@ -1524,8 +1557,8 @@ int r600_bytecode_build(struct r600_bytecode *bc)
>>>           unsigned addr;
>>>           int i, r;
>>> 
>>>  -       if (bc->callstack[0].max > 0)
>>>  -               bc->nstack = ((bc->callstack[0].max + 3) >> 
> 2) + 2;
>>>  +       bc->nstack = bc->stack.max_entries;
>>>  +
>>>           if (bc->type == TGSI_PROCESSOR_VERTEX && 
> !bc->nstack) {
>>>                   bc->nstack = 1;
>>>           }
>>>  @@ -1826,8 +1859,8 @@ void r600_bytecode_disasm(struct r600_bytecode 
> *bc)
>>>                   chip = '6';
>>>                   break;
>>>           }
>>>  -       fprintf(stderr, "bytecode %d dw -- %d gprs 
> ---------------------\n",
>>>  -               bc->ndw, bc->ngpr);
>>>  +       fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack 
> -------------\n",
>>>  +               bc->ndw, bc->ngpr, bc->nstack);
>>>           fprintf(stderr, "shader %d -- %c\n", index++, 
> chip);
>>> 
>>>           LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
>>>  @@ -2105,7 +2138,8 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
>>>                   chip = '6';
>>>                   break;
>>>           }
>>>  -       fprintf(stderr, "bytecode %d dw -- %d gprs 
> ---------------------\n", bc->ndw, bc->ngpr);
>>>  +       fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack 
> -------------\n",
>>>  +               bc->ndw, bc->ngpr, bc->nstack);
>>>           fprintf(stderr, "     %c\n", chip);
>>> 
>>>           LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
>>>  diff --git a/src/gallium/drivers/r600/r600_asm.h 
> b/src/gallium/drivers/r600/r600_asm.h
>>>  index 03cd238..5a9869d 100644
>>>  --- a/src/gallium/drivers/r600/r600_asm.h
>>>  +++ b/src/gallium/drivers/r600/r600_asm.h
>>>  @@ -173,16 +173,25 @@ struct r600_cf_stack_entry {
>>>    };
>>> 
>>>    #define SQ_MAX_CALL_DEPTH 0x00000020
>>>  -struct r600_cf_callstack {
>>>  -       unsigned                        fc_sp_before_entry;
>>>  -       int                             sub_desc_index;
>>>  -       int                             current;
>>>  -       int                             max;
>>>  -};
>>> 
>>>    #define AR_HANDLE_NORMAL 0
>>>    #define AR_HANDLE_RV6XX 1 /* except RV670 */
>>> 
>>>  +struct r600_stack_info {
>>>  +       /* current level of non-WQM PUSH operations
>>>  +        * (PUSH, PUSH_ELSE, ALU_PUSH_BEFORE) */
>>>  +       int push;
>>>  +       /* current level of WQM PUSH operations
>>>  +        * (PUSH, PUSH_ELSE, PUSH_WQM) */
>>>  +       int push_wqm;
>>>  +       /* current loop level */
>>>  +       int loop;
>>>  +
>>>  +       /* required depth */
>>>  +       int max_entries;
>>>  +       /* subentries per entry */
>>>  +       int entry_size;
>>>  +};
>>> 
>>>    struct r600_bytecode {
>>>           enum chip_class                 chip_class;
>>>  @@ -199,8 +208,7 @@ struct r600_bytecode {
>>>           uint32_t                        *bytecode;
>>>           uint32_t                        fc_sp;
>>>           struct r600_cf_stack_entry      fc_stack[32];
>>>  -       unsigned                        call_sp;
>>>  -       struct r600_cf_callstack        callstack[SQ_MAX_CALL_DEPTH];
>>>  +       struct r600_stack_info          stack;
>>>           unsigned        ar_loaded;
>>>           unsigned        ar_reg;
>>>           unsigned        ar_chan;
>>>  diff --git a/src/gallium/drivers/r600/r600_shader.c 
> b/src/gallium/drivers/r600/r600_shader.c
>>>  index 8642463..404aea7 100644
>>>  --- a/src/gallium/drivers/r600/r600_shader.c
>>>  +++ b/src/gallium/drivers/r600/r600_shader.c
>>>  @@ -234,7 +234,7 @@ struct r600_shader_tgsi_instruction {
>>> 
>>>    static struct r600_shader_tgsi_instruction 
> r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], 
> cm_shader_tgsi_instruction[];
>>>    static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
>>>  -static inline void callstack_check_depth(struct r600_shader_ctx *ctx, 
> unsigned reason, unsigned check_max_only);
>>>  +static inline void callstack_push(struct r600_shader_ctx *ctx, 
> unsigned reason);
>>>    static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
>>>    static int tgsi_else(struct r600_shader_ctx *ctx);
>>>    static int tgsi_endif(struct r600_shader_ctx *ctx);
>>>  @@ -412,7 +412,7 @@ static void llvm_if(struct r600_shader_ctx *ctx)
>>>    {
>>>           r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
>>>           fc_pushlevel(ctx, FC_IF);
>>>  -       callstack_check_depth(ctx, FC_PUSH_VPM, 0);
>>>  +       callstack_push(ctx, FC_PUSH_VPM);
>>>    }
>>> 
>>>    static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx)
>>>  @@ -5522,63 +5522,107 @@ static int pops(struct r600_shader_ctx *ctx, 
> int pops)
>>>           return 0;
>>>    }
>>> 
>>>  -static inline void callstack_decrease_current(struct r600_shader_ctx 
> *ctx, unsigned reason)
>>>  +static inline void callstack_update_max_depth(struct r600_shader_ctx 
> *ctx,
>>>  +                                              unsigned reason)
>>>  +{
>>>  +       struct r600_stack_info *stack = &ctx->bc->stack;
>>>  +       unsigned elements, entries;
>>>  +
>>>  +       unsigned entry_size = stack->entry_size;
>>>  +
>>>  +       elements = (stack->loop + stack->push_wqm ) * entry_size;
>>>  +       elements += stack->push;
>>>  +
>>>  +       switch (ctx->bc->chip_class) {
>>>  +       case R600:
>>>  +       case R700:
>>>  +               /* pre-r8xx: if any non-WQM PUSH instruction is 
> invoked, 2 elements on
>>>  +                * the stack must be reserved to hold the current 
> active/continue
>>>  +                * masks */
>>>  +               if (reason == FC_PUSH_VPM) {
>>>  +                       elements += 2;
>>>  +               }
>>>  +               break;
>>>  +
>>>  +       case CAYMAN:
>>>  +               /* r9xx: any stack operation on empty stack consumes 2 
> additional
>>>  +                * elements */
>>>  +               elements += 2;
>>>  +
>>>  +               /* fallthrough */
>>>  +               /* FIXME: do the two elements added above cover the 
> cases for the
>>>  +                * r8xx+ below? */
>>>  +
>>>  +       case EVERGREEN:
>>>  +               /* r8xx+: 2 extra elements are not always required, but 
> one extra
>>>  +                * element must be added for each of the following 
> cases:
>>>  +                * 1. There is an ALU_ELSE_AFTER instruction at the 
> point of greatest
>>>  +                *    stack usage.
>>>  +                *    (Currently we don't use ALU_ELSE_AFTER.)
>>>  +                * 2. There are LOOP/WQM frames on the stack when any 
> flavor of non-WQM
>>>  +                *    PUSH instruction executed.
>>>  +                *
>>>  +                *    NOTE: it seems we also need to reserve additional 
> element in some
>>>  +                *    other cases, e.g. when we have 4 levels of 
> PUSH_VPM in the shader,
>>>  +                *    then STACK_SIZE should be 2 instead of 1 */
>>>  +               if (reason == FC_PUSH_VPM) {
>>>  +                       elements += 1;
>>>  +               }
>>>  +               break;
>>>  +
>>>  +       default:
>>>  +               assert(0);
>>>  +               break;
>>>  +       }
>>>  +
>>>  +       /* NOTE: it seems STACK_SIZE is interpreted by hw as if 
> entry_size is 4
>>>  +        * for all chips, so we use 4 in the final formula, not the 
> real entry_size
>>>  +        * for the chip */
>>>  +       entry_size = 4;
>>>  +
>>>  +       entries = (elements + (entry_size - 1)) / entry_size;
>>>  +
>>>  +       if (entries > stack->max_entries)
>>>  +               stack->max_entries = entries;
>>>  +}
>>>  +
>>>  +static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned 
> reason)
>>>    {
>>>           switch(reason) {
>>>           case FC_PUSH_VPM:
>>>  -              
> ctx->bc->callstack[ctx->bc->call_sp].current--;
>>>  +               --ctx->bc->stack.push;
>>>  +               assert(ctx->bc->stack.push >= 0);
>>>                   break;
>>>           case FC_PUSH_WQM:
>>>  +               --ctx->bc->stack.push_wqm;
>>>  +               assert(ctx->bc->stack.push_wqm >= 0);
>>>  +               break;
>>>           case FC_LOOP:
>>>  -              
> ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
>>>  +               --ctx->bc->stack.loop;
>>>  +               assert(ctx->bc->stack.loop >= 0);
>>>                   break;
>>>  -       case FC_REP:
>>>  -               /* TOODO : for 16 vp asic should -= 2; */
>>>  -              
> ctx->bc->callstack[ctx->bc->call_sp].current --;
>>>  +       default:
>>>  +               assert(0);
>>>                   break;
>>>           }
>>>    }
>>> 
>>>  -static inline void callstack_check_depth(struct r600_shader_ctx *ctx, 
> unsigned reason, unsigned check_max_only)
>>>  +static inline void callstack_push(struct r600_shader_ctx *ctx, 
> unsigned reason)
>>>    {
>>>  -       if (check_max_only) {
>>>  -               int diff;
>>>  -               switch (reason) {
>>>  -               case FC_PUSH_VPM:
>>>  -                       diff = 1;
>>>  -                       break;
>>>  -               case FC_PUSH_WQM:
>>>  -                       diff = 4;
>>>  -                       break;
>>>  -               default:
>>>  -                       assert(0);
>>>  -                       diff = 0;
>>>  -               }
>>>  -               if 
> ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
>>>  -                  
> ctx->bc->callstack[ctx->bc->call_sp].max) {
>>>  -                      
> ctx->bc->callstack[ctx->bc->call_sp].max =
>>>  -                              
> ctx->bc->callstack[ctx->bc->call_sp].current + diff;
>>>  -               }
>>>  -               return;
>>>  -       }
>>>           switch (reason) {
>>>           case FC_PUSH_VPM:
>>>  -              
> ctx->bc->callstack[ctx->bc->call_sp].current++;
>>>  +               ++ctx->bc->stack.push;
>>>                   break;
>>>           case FC_PUSH_WQM:
>>>  +               ++ctx->bc->stack.push_wqm;
>>>           case FC_LOOP:
>>>  -              
> ctx->bc->callstack[ctx->bc->call_sp].current += 4;
>>>  -               break;
>>>  -       case FC_REP:
>>>  -              
> ctx->bc->callstack[ctx->bc->call_sp].current++;
>>>  +               ++ctx->bc->stack.loop;
>>>                   break;
>>>  +       default:
>>>  +               assert(0);
>>>           }
>>> 
>>>  -       if ((ctx->bc->callstack[ctx->bc->call_sp].current) 
>> 
>>>  -           ctx->bc->callstack[ctx->bc->call_sp].max) {
>>>  -               ctx->bc->callstack[ctx->bc->call_sp].max =
>>>  -                      
> ctx->bc->callstack[ctx->bc->call_sp].current;
>>>  -       }
>>>  +       callstack_update_max_depth(ctx, reason);
>>>    }
>>> 
>>>    static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
>>>  @@ -5665,7 +5709,7 @@ static int tgsi_if(struct r600_shader_ctx *ctx)
>>> 
>>>           fc_pushlevel(ctx, FC_IF);
>>> 
>>>  -       callstack_check_depth(ctx, FC_PUSH_VPM, 0);
>>>  +       callstack_push(ctx, FC_PUSH_VPM);
>>>           return 0;
>>>    }
>>> 
>>>  @@ -5695,7 +5739,7 @@ static int tgsi_endif(struct r600_shader_ctx 
> *ctx)
>>>           }
>>>           fc_poplevel(ctx);
>>> 
>>>  -       callstack_decrease_current(ctx, FC_PUSH_VPM);
>>>  +       callstack_pop(ctx, FC_PUSH_VPM);
>>>           return 0;
>>>    }
>>> 
>>>  @@ -5708,7 +5752,7 @@ static int tgsi_bgnloop(struct r600_shader_ctx 
> *ctx)
>>>           fc_pushlevel(ctx, FC_LOOP);
>>> 
>>>           /* check stack depth */
>>>  -       callstack_check_depth(ctx, FC_LOOP, 0);
>>>  +       callstack_push(ctx, FC_LOOP);
>>>           return 0;
>>>    }
>>> 
>>>  @@ -5737,7 +5781,7 @@ static int tgsi_endloop(struct r600_shader_ctx 
> *ctx)
>>>           }
>>>           /* XXX add LOOPRET support */
>>>           fc_poplevel(ctx);
>>>  -       callstack_decrease_current(ctx, FC_LOOP);
>>>  +       callstack_pop(ctx, FC_LOOP);
>>>           return 0;
>>>    }
>>> 
>>>  @@ -5760,7 +5804,6 @@ static int tgsi_loop_brk_cont(struct 
> r600_shader_ctx *ctx)
>>> 
>>>           fc_set_mid(ctx, fscp);
>>> 
>>>  -       callstack_check_depth(ctx, FC_PUSH_VPM, 1);
>>>           return 0;
>>>    }
>>> 
>>>  --
>>>  1.8.1.2
>>> 
>>>  _______________________________________________
>>>  mesa-dev mailing list
>>>  mesa-dev at lists.freedesktop.org
>>>  http://lists.freedesktop.org/mailman/listinfo/mesa-dev
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>