[Mesa-dev] [PATCH] r600g: don't reserve more stack space than required v4
Vincent Lejeune
vljn at ovi.com
Sun Mar 31 13:00:28 PDT 2013
Hi Vadim,
Does this patch work ? (It's still not pushed)
I'm working on doing native control flow for llvm and intend to port your patch on the control flow reservation.
Vincent
----- Mail original -----
> De : Vadim Girlin <vadimgirlin at gmail.com>
> À : Alex Deucher <alexdeucher at gmail.com>
> Cc : mesa-dev at lists.freedesktop.org
> Envoyé le : Vendredi 22 février 2013 1h37
> Objet : Re: [Mesa-dev] [PATCH] r600g: don't reserve more stack space than required v4
>
> On 02/22/2013 04:23 AM, Alex Deucher wrote:
>> On Thu, Feb 21, 2013 at 6:52 PM, Vadim Girlin <vadimgirlin at gmail.com>
> wrote:
>>> v4: implement exact computation taking into account wavefront size
>>>
>>> Signed-off-by: Vadim Girlin <vadimgirlin at gmail.com>
>>> ---
>>> src/gallium/drivers/r600/r600_asm.c | 44 +++++++++--
>>> src/gallium/drivers/r600/r600_asm.h | 24 ++++--
>>> src/gallium/drivers/r600/r600_shader.c | 131
> ++++++++++++++++++++++-----------
>>> 3 files changed, 142 insertions(+), 57 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/r600/r600_asm.c
> b/src/gallium/drivers/r600/r600_asm.c
>>> index 3632aa5..f041e27 100644
>>> --- a/src/gallium/drivers/r600/r600_asm.c
>>> +++ b/src/gallium/drivers/r600/r600_asm.c
>>> @@ -86,6 +86,38 @@ static struct r600_bytecode_tex
> *r600_bytecode_tex(void)
>>> return tex;
>>> }
>>>
>>> +static unsigned stack_entry_size(enum radeon_family chip) {
>>> + /* Wavefront size:
>>> + * 64: R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/
>>> + * Aruba/Sumo/Sumo2/redwood/juniper
>>> + * 32: R630/R730/R710/Palm/Cedar
>>> + * 16: R610/Rs780
>>> + *
>>> + * Stack row size:
>>> + * Wavefront Size 16 32 48 64
>>> + * Columns per Row (R6xx/R7xx/R8xx only) 8 8 4 4
>>> + * Columns per Row (R9xx+) 8 4 4 4 */
>>> +
>>> + switch (chip) {
>>> + /* FIXME: are some chips missing here? */
>>> + /* wavefront size 16 */
>>> + case CHIP_RV610:
>>> + case CHIP_RS780:
>>
>> RV620
>> RS880
>>
>> Should be 16 as well.
>
> Thanks, I'll add them.
>
> Vadim
>
>>
>>> + /* wavefront size 32 */
>>> + case CHIP_RV630:
>>> + case CHIP_RV635:
>>> + case CHIP_RV730:
>>> + case CHIP_RV710:
>>> + case CHIP_PALM:
>>> + case CHIP_CEDAR:
>>> + return 8;
>>> +
>>> + /* wavefront size 64 */
>>> + default:
>>> + return 4;
>>> + }
>>> +}
>>> +
>>> void r600_bytecode_init(struct r600_bytecode *bc,
>>> enum chip_class chip_class,
>>> enum radeon_family family,
>>> @@ -103,6 +135,7 @@ void r600_bytecode_init(struct r600_bytecode *bc,
>>> LIST_INITHEAD(&bc->cf);
>>> bc->chip_class = chip_class;
>>> bc->msaa_texture_mode = msaa_texture_mode;
>>> + bc->stack.entry_size = stack_entry_size(family);
>>> }
>>>
>>> static int r600_bytecode_add_cf(struct r600_bytecode *bc)
>>> @@ -1524,8 +1557,8 @@ int r600_bytecode_build(struct r600_bytecode *bc)
>>> unsigned addr;
>>> int i, r;
>>>
>>> - if (bc->callstack[0].max > 0)
>>> - bc->nstack = ((bc->callstack[0].max + 3) >>
> 2) + 2;
>>> + bc->nstack = bc->stack.max_entries;
>>> +
>>> if (bc->type == TGSI_PROCESSOR_VERTEX &&
> !bc->nstack) {
>>> bc->nstack = 1;
>>> }
>>> @@ -1826,8 +1859,8 @@ void r600_bytecode_disasm(struct r600_bytecode
> *bc)
>>> chip = '6';
>>> break;
>>> }
>>> - fprintf(stderr, "bytecode %d dw -- %d gprs
> ---------------------\n",
>>> - bc->ndw, bc->ngpr);
>>> + fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack
> -------------\n",
>>> + bc->ndw, bc->ngpr, bc->nstack);
>>> fprintf(stderr, "shader %d -- %c\n", index++,
> chip);
>>>
>>> LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
>>> @@ -2105,7 +2138,8 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
>>> chip = '6';
>>> break;
>>> }
>>> - fprintf(stderr, "bytecode %d dw -- %d gprs
> ---------------------\n", bc->ndw, bc->ngpr);
>>> + fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack
> -------------\n",
>>> + bc->ndw, bc->ngpr, bc->nstack);
>>> fprintf(stderr, " %c\n", chip);
>>>
>>> LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
>>> diff --git a/src/gallium/drivers/r600/r600_asm.h
> b/src/gallium/drivers/r600/r600_asm.h
>>> index 03cd238..5a9869d 100644
>>> --- a/src/gallium/drivers/r600/r600_asm.h
>>> +++ b/src/gallium/drivers/r600/r600_asm.h
>>> @@ -173,16 +173,25 @@ struct r600_cf_stack_entry {
>>> };
>>>
>>> #define SQ_MAX_CALL_DEPTH 0x00000020
>>> -struct r600_cf_callstack {
>>> - unsigned fc_sp_before_entry;
>>> - int sub_desc_index;
>>> - int current;
>>> - int max;
>>> -};
>>>
>>> #define AR_HANDLE_NORMAL 0
>>> #define AR_HANDLE_RV6XX 1 /* except RV670 */
>>>
>>> +struct r600_stack_info {
>>> + /* current level of non-WQM PUSH operations
>>> + * (PUSH, PUSH_ELSE, ALU_PUSH_BEFORE) */
>>> + int push;
>>> + /* current level of WQM PUSH operations
>>> + * (PUSH, PUSH_ELSE, PUSH_WQM) */
>>> + int push_wqm;
>>> + /* current loop level */
>>> + int loop;
>>> +
>>> + /* required depth */
>>> + int max_entries;
>>> + /* subentries per entry */
>>> + int entry_size;
>>> +};
>>>
>>> struct r600_bytecode {
>>> enum chip_class chip_class;
>>> @@ -199,8 +208,7 @@ struct r600_bytecode {
>>> uint32_t *bytecode;
>>> uint32_t fc_sp;
>>> struct r600_cf_stack_entry fc_stack[32];
>>> - unsigned call_sp;
>>> - struct r600_cf_callstack callstack[SQ_MAX_CALL_DEPTH];
>>> + struct r600_stack_info stack;
>>> unsigned ar_loaded;
>>> unsigned ar_reg;
>>> unsigned ar_chan;
>>> diff --git a/src/gallium/drivers/r600/r600_shader.c
> b/src/gallium/drivers/r600/r600_shader.c
>>> index 8642463..404aea7 100644
>>> --- a/src/gallium/drivers/r600/r600_shader.c
>>> +++ b/src/gallium/drivers/r600/r600_shader.c
>>> @@ -234,7 +234,7 @@ struct r600_shader_tgsi_instruction {
>>>
>>> static struct r600_shader_tgsi_instruction
> r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[],
> cm_shader_tgsi_instruction[];
>>> static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
>>> -static inline void callstack_check_depth(struct r600_shader_ctx *ctx,
> unsigned reason, unsigned check_max_only);
>>> +static inline void callstack_push(struct r600_shader_ctx *ctx,
> unsigned reason);
>>> static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
>>> static int tgsi_else(struct r600_shader_ctx *ctx);
>>> static int tgsi_endif(struct r600_shader_ctx *ctx);
>>> @@ -412,7 +412,7 @@ static void llvm_if(struct r600_shader_ctx *ctx)
>>> {
>>> r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
>>> fc_pushlevel(ctx, FC_IF);
>>> - callstack_check_depth(ctx, FC_PUSH_VPM, 0);
>>> + callstack_push(ctx, FC_PUSH_VPM);
>>> }
>>>
>>> static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx)
>>> @@ -5522,63 +5522,107 @@ static int pops(struct r600_shader_ctx *ctx,
> int pops)
>>> return 0;
>>> }
>>>
>>> -static inline void callstack_decrease_current(struct r600_shader_ctx
> *ctx, unsigned reason)
>>> +static inline void callstack_update_max_depth(struct r600_shader_ctx
> *ctx,
>>> + unsigned reason)
>>> +{
>>> + struct r600_stack_info *stack = &ctx->bc->stack;
>>> + unsigned elements, entries;
>>> +
>>> + unsigned entry_size = stack->entry_size;
>>> +
>>> + elements = (stack->loop + stack->push_wqm ) * entry_size;
>>> + elements += stack->push;
>>> +
>>> + switch (ctx->bc->chip_class) {
>>> + case R600:
>>> + case R700:
>>> + /* pre-r8xx: if any non-WQM PUSH instruction is
> invoked, 2 elements on
>>> + * the stack must be reserved to hold the current
> active/continue
>>> + * masks */
>>> + if (reason == FC_PUSH_VPM) {
>>> + elements += 2;
>>> + }
>>> + break;
>>> +
>>> + case CAYMAN:
>>> + /* r9xx: any stack operation on empty stack consumes 2
> additional
>>> + * elements */
>>> + elements += 2;
>>> +
>>> + /* fallthrough */
>>> + /* FIXME: do the two elements added above cover the
> cases for the
>>> + * r8xx+ below? */
>>> +
>>> + case EVERGREEN:
>>> + /* r8xx+: 2 extra elements are not always required, but
> one extra
>>> + * element must be added for each of the following
> cases:
>>> + * 1. There is an ALU_ELSE_AFTER instruction at the
> point of greatest
>>> + * stack usage.
>>> + * (Currently we don't use ALU_ELSE_AFTER.)
>>> + * 2. There are LOOP/WQM frames on the stack when any
> flavor of non-WQM
>>> + * PUSH instruction executed.
>>> + *
>>> + * NOTE: it seems we also need to reserve additional
> element in some
>>> + * other cases, e.g. when we have 4 levels of
> PUSH_VPM in the shader,
>>> + * then STACK_SIZE should be 2 instead of 1 */
>>> + if (reason == FC_PUSH_VPM) {
>>> + elements += 1;
>>> + }
>>> + break;
>>> +
>>> + default:
>>> + assert(0);
>>> + break;
>>> + }
>>> +
>>> + /* NOTE: it seems STACK_SIZE is interpreted by hw as if
> entry_size is 4
>>> + * for all chips, so we use 4 in the final formula, not the
> real entry_size
>>> + * for the chip */
>>> + entry_size = 4;
>>> +
>>> + entries = (elements + (entry_size - 1)) / entry_size;
>>> +
>>> + if (entries > stack->max_entries)
>>> + stack->max_entries = entries;
>>> +}
>>> +
>>> +static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned
> reason)
>>> {
>>> switch(reason) {
>>> case FC_PUSH_VPM:
>>> -
> ctx->bc->callstack[ctx->bc->call_sp].current--;
>>> + --ctx->bc->stack.push;
>>> + assert(ctx->bc->stack.push >= 0);
>>> break;
>>> case FC_PUSH_WQM:
>>> + --ctx->bc->stack.push_wqm;
>>> + assert(ctx->bc->stack.push_wqm >= 0);
>>> + break;
>>> case FC_LOOP:
>>> -
> ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
>>> + --ctx->bc->stack.loop;
>>> + assert(ctx->bc->stack.loop >= 0);
>>> break;
>>> - case FC_REP:
>>> - /* TOODO : for 16 vp asic should -= 2; */
>>> -
> ctx->bc->callstack[ctx->bc->call_sp].current --;
>>> + default:
>>> + assert(0);
>>> break;
>>> }
>>> }
>>>
>>> -static inline void callstack_check_depth(struct r600_shader_ctx *ctx,
> unsigned reason, unsigned check_max_only)
>>> +static inline void callstack_push(struct r600_shader_ctx *ctx,
> unsigned reason)
>>> {
>>> - if (check_max_only) {
>>> - int diff;
>>> - switch (reason) {
>>> - case FC_PUSH_VPM:
>>> - diff = 1;
>>> - break;
>>> - case FC_PUSH_WQM:
>>> - diff = 4;
>>> - break;
>>> - default:
>>> - assert(0);
>>> - diff = 0;
>>> - }
>>> - if
> ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
>>> -
> ctx->bc->callstack[ctx->bc->call_sp].max) {
>>> -
> ctx->bc->callstack[ctx->bc->call_sp].max =
>>> -
> ctx->bc->callstack[ctx->bc->call_sp].current + diff;
>>> - }
>>> - return;
>>> - }
>>> switch (reason) {
>>> case FC_PUSH_VPM:
>>> -
> ctx->bc->callstack[ctx->bc->call_sp].current++;
>>> + ++ctx->bc->stack.push;
>>> break;
>>> case FC_PUSH_WQM:
>>> + ++ctx->bc->stack.push_wqm;
>>> case FC_LOOP:
>>> -
> ctx->bc->callstack[ctx->bc->call_sp].current += 4;
>>> - break;
>>> - case FC_REP:
>>> -
> ctx->bc->callstack[ctx->bc->call_sp].current++;
>>> + ++ctx->bc->stack.loop;
>>> break;
>>> + default:
>>> + assert(0);
>>> }
>>>
>>> - if ((ctx->bc->callstack[ctx->bc->call_sp].current)
>>
>>> - ctx->bc->callstack[ctx->bc->call_sp].max) {
>>> - ctx->bc->callstack[ctx->bc->call_sp].max =
>>> -
> ctx->bc->callstack[ctx->bc->call_sp].current;
>>> - }
>>> + callstack_update_max_depth(ctx, reason);
>>> }
>>>
>>> static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
>>> @@ -5665,7 +5709,7 @@ static int tgsi_if(struct r600_shader_ctx *ctx)
>>>
>>> fc_pushlevel(ctx, FC_IF);
>>>
>>> - callstack_check_depth(ctx, FC_PUSH_VPM, 0);
>>> + callstack_push(ctx, FC_PUSH_VPM);
>>> return 0;
>>> }
>>>
>>> @@ -5695,7 +5739,7 @@ static int tgsi_endif(struct r600_shader_ctx
> *ctx)
>>> }
>>> fc_poplevel(ctx);
>>>
>>> - callstack_decrease_current(ctx, FC_PUSH_VPM);
>>> + callstack_pop(ctx, FC_PUSH_VPM);
>>> return 0;
>>> }
>>>
>>> @@ -5708,7 +5752,7 @@ static int tgsi_bgnloop(struct r600_shader_ctx
> *ctx)
>>> fc_pushlevel(ctx, FC_LOOP);
>>>
>>> /* check stack depth */
>>> - callstack_check_depth(ctx, FC_LOOP, 0);
>>> + callstack_push(ctx, FC_LOOP);
>>> return 0;
>>> }
>>>
>>> @@ -5737,7 +5781,7 @@ static int tgsi_endloop(struct r600_shader_ctx
> *ctx)
>>> }
>>> /* XXX add LOOPRET support */
>>> fc_poplevel(ctx);
>>> - callstack_decrease_current(ctx, FC_LOOP);
>>> + callstack_pop(ctx, FC_LOOP);
>>> return 0;
>>> }
>>>
>>> @@ -5760,7 +5804,6 @@ static int tgsi_loop_brk_cont(struct
> r600_shader_ctx *ctx)
>>>
>>> fc_set_mid(ctx, fscp);
>>>
>>> - callstack_check_depth(ctx, FC_PUSH_VPM, 1);
>>> return 0;
>>> }
>>>
>>> --
>>> 1.8.1.2
>>>
>>> _______________________________________________
>>> mesa-dev mailing list
>>> mesa-dev at lists.freedesktop.org
>>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
More information about the mesa-dev
mailing list