[Mesa-dev] [PATCH] r600g: don't reserve more stack space than required v4
Alex Deucher
alexdeucher at gmail.com
Thu Feb 21 16:23:18 PST 2013
On Thu, Feb 21, 2013 at 6:52 PM, Vadim Girlin <vadimgirlin at gmail.com> wrote:
> v4: implement exact computation taking into account wavefront size
>
> Signed-off-by: Vadim Girlin <vadimgirlin at gmail.com>
> ---
> src/gallium/drivers/r600/r600_asm.c | 44 +++++++++--
> src/gallium/drivers/r600/r600_asm.h | 24 ++++--
> src/gallium/drivers/r600/r600_shader.c | 131 ++++++++++++++++++++++-----------
> 3 files changed, 142 insertions(+), 57 deletions(-)
>
> diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
> index 3632aa5..f041e27 100644
> --- a/src/gallium/drivers/r600/r600_asm.c
> +++ b/src/gallium/drivers/r600/r600_asm.c
> @@ -86,6 +86,38 @@ static struct r600_bytecode_tex *r600_bytecode_tex(void)
> return tex;
> }
>
> +static unsigned stack_entry_size(enum radeon_family chip) {
> + /* Wavefront size:
> + * 64: R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/
> + * Aruba/Sumo/Sumo2/redwood/juniper
> + * 32: R630/R730/R710/Palm/Cedar
> + * 16: R610/Rs780
> + *
> + * Stack row size:
> + * Wavefront Size 16 32 48 64
> + * Columns per Row (R6xx/R7xx/R8xx only) 8 8 4 4
> + * Columns per Row (R9xx+) 8 4 4 4 */
> +
> + switch (chip) {
> + /* FIXME: are some chips missing here? */
> + /* wavefront size 16 */
> + case CHIP_RV610:
> + case CHIP_RS780:
RV620 and RS880 should be wavefront size 16 as well, so they belong in this
group of cases too.
> + /* wavefront size 32 */
> + case CHIP_RV630:
> + case CHIP_RV635:
> + case CHIP_RV730:
> + case CHIP_RV710:
> + case CHIP_PALM:
> + case CHIP_CEDAR:
> + return 8;
> +
> + /* wavefront size 64 */
> + default:
> + return 4;
> + }
> +}
> +
> void r600_bytecode_init(struct r600_bytecode *bc,
> enum chip_class chip_class,
> enum radeon_family family,
> @@ -103,6 +135,7 @@ void r600_bytecode_init(struct r600_bytecode *bc,
> LIST_INITHEAD(&bc->cf);
> bc->chip_class = chip_class;
> bc->msaa_texture_mode = msaa_texture_mode;
> + bc->stack.entry_size = stack_entry_size(family);
> }
>
> static int r600_bytecode_add_cf(struct r600_bytecode *bc)
> @@ -1524,8 +1557,8 @@ int r600_bytecode_build(struct r600_bytecode *bc)
> unsigned addr;
> int i, r;
>
> - if (bc->callstack[0].max > 0)
> - bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
> + bc->nstack = bc->stack.max_entries;
> +
> if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
> bc->nstack = 1;
> }
> @@ -1826,8 +1859,8 @@ void r600_bytecode_disasm(struct r600_bytecode *bc)
> chip = '6';
> break;
> }
> - fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n",
> - bc->ndw, bc->ngpr);
> + fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack -------------\n",
> + bc->ndw, bc->ngpr, bc->nstack);
> fprintf(stderr, "shader %d -- %c\n", index++, chip);
>
> LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
> @@ -2105,7 +2138,8 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
> chip = '6';
> break;
> }
> - fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n", bc->ndw, bc->ngpr);
> + fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack -------------\n",
> + bc->ndw, bc->ngpr, bc->nstack);
> fprintf(stderr, " %c\n", chip);
>
> LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
> diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
> index 03cd238..5a9869d 100644
> --- a/src/gallium/drivers/r600/r600_asm.h
> +++ b/src/gallium/drivers/r600/r600_asm.h
> @@ -173,16 +173,25 @@ struct r600_cf_stack_entry {
> };
>
> #define SQ_MAX_CALL_DEPTH 0x00000020
> -struct r600_cf_callstack {
> - unsigned fc_sp_before_entry;
> - int sub_desc_index;
> - int current;
> - int max;
> -};
>
> #define AR_HANDLE_NORMAL 0
> #define AR_HANDLE_RV6XX 1 /* except RV670 */
>
> +struct r600_stack_info {
> + /* current level of non-WQM PUSH operations
> + * (PUSH, PUSH_ELSE, ALU_PUSH_BEFORE) */
> + int push;
> + /* current level of WQM PUSH operations
> + * (PUSH, PUSH_ELSE, PUSH_WQM) */
> + int push_wqm;
> + /* current loop level */
> + int loop;
> +
> + /* required depth */
> + int max_entries;
> + /* subentries per entry */
> + int entry_size;
> +};
>
> struct r600_bytecode {
> enum chip_class chip_class;
> @@ -199,8 +208,7 @@ struct r600_bytecode {
> uint32_t *bytecode;
> uint32_t fc_sp;
> struct r600_cf_stack_entry fc_stack[32];
> - unsigned call_sp;
> - struct r600_cf_callstack callstack[SQ_MAX_CALL_DEPTH];
> + struct r600_stack_info stack;
> unsigned ar_loaded;
> unsigned ar_reg;
> unsigned ar_chan;
> diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
> index 8642463..404aea7 100644
> --- a/src/gallium/drivers/r600/r600_shader.c
> +++ b/src/gallium/drivers/r600/r600_shader.c
> @@ -234,7 +234,7 @@ struct r600_shader_tgsi_instruction {
>
> static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
> static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
> -static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only);
> +static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
> static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
> static int tgsi_else(struct r600_shader_ctx *ctx);
> static int tgsi_endif(struct r600_shader_ctx *ctx);
> @@ -412,7 +412,7 @@ static void llvm_if(struct r600_shader_ctx *ctx)
> {
> r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
> fc_pushlevel(ctx, FC_IF);
> - callstack_check_depth(ctx, FC_PUSH_VPM, 0);
> + callstack_push(ctx, FC_PUSH_VPM);
> }
>
> static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx)
> @@ -5522,63 +5522,107 @@ static int pops(struct r600_shader_ctx *ctx, int pops)
> return 0;
> }
>
> -static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
> +static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
> + unsigned reason)
> +{
> + struct r600_stack_info *stack = &ctx->bc->stack;
> + unsigned elements, entries;
> +
> + unsigned entry_size = stack->entry_size;
> +
> + elements = (stack->loop + stack->push_wqm ) * entry_size;
> + elements += stack->push;
> +
> + switch (ctx->bc->chip_class) {
> + case R600:
> + case R700:
> + /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
> + * the stack must be reserved to hold the current active/continue
> + * masks */
> + if (reason == FC_PUSH_VPM) {
> + elements += 2;
> + }
> + break;
> +
> + case CAYMAN:
> + /* r9xx: any stack operation on empty stack consumes 2 additional
> + * elements */
> + elements += 2;
> +
> + /* fallthrough */
> + /* FIXME: do the two elements added above cover the cases for the
> + * r8xx+ below? */
> +
> + case EVERGREEN:
> + /* r8xx+: 2 extra elements are not always required, but one extra
> + * element must be added for each of the following cases:
> + * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
> + * stack usage.
> + * (Currently we don't use ALU_ELSE_AFTER.)
> + * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
> + * PUSH instruction executed.
> + *
> + * NOTE: it seems we also need to reserve additional element in some
> + * other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
> + * then STACK_SIZE should be 2 instead of 1 */
> + if (reason == FC_PUSH_VPM) {
> + elements += 1;
> + }
> + break;
> +
> + default:
> + assert(0);
> + break;
> + }
> +
> + /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
> + * for all chips, so we use 4 in the final formula, not the real entry_size
> + * for the chip */
> + entry_size = 4;
> +
> + entries = (elements + (entry_size - 1)) / entry_size;
> +
> + if (entries > stack->max_entries)
> + stack->max_entries = entries;
> +}
> +
> +static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
> {
> switch(reason) {
> case FC_PUSH_VPM:
> - ctx->bc->callstack[ctx->bc->call_sp].current--;
> + --ctx->bc->stack.push;
> + assert(ctx->bc->stack.push >= 0);
> break;
> case FC_PUSH_WQM:
> + --ctx->bc->stack.push_wqm;
> + assert(ctx->bc->stack.push_wqm >= 0);
> + break;
> case FC_LOOP:
> - ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
> + --ctx->bc->stack.loop;
> + assert(ctx->bc->stack.loop >= 0);
> break;
> - case FC_REP:
> - /* TOODO : for 16 vp asic should -= 2; */
> - ctx->bc->callstack[ctx->bc->call_sp].current --;
> + default:
> + assert(0);
> break;
> }
> }
>
> -static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
> +static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
> {
> - if (check_max_only) {
> - int diff;
> - switch (reason) {
> - case FC_PUSH_VPM:
> - diff = 1;
> - break;
> - case FC_PUSH_WQM:
> - diff = 4;
> - break;
> - default:
> - assert(0);
> - diff = 0;
> - }
> - if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
> - ctx->bc->callstack[ctx->bc->call_sp].max) {
> - ctx->bc->callstack[ctx->bc->call_sp].max =
> - ctx->bc->callstack[ctx->bc->call_sp].current + diff;
> - }
> - return;
> - }
> switch (reason) {
> case FC_PUSH_VPM:
> - ctx->bc->callstack[ctx->bc->call_sp].current++;
> + ++ctx->bc->stack.push;
> break;
> case FC_PUSH_WQM:
> + ++ctx->bc->stack.push_wqm;
> case FC_LOOP:
> - ctx->bc->callstack[ctx->bc->call_sp].current += 4;
> - break;
> - case FC_REP:
> - ctx->bc->callstack[ctx->bc->call_sp].current++;
> + ++ctx->bc->stack.loop;
> break;
> + default:
> + assert(0);
> }
>
> - if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
> - ctx->bc->callstack[ctx->bc->call_sp].max) {
> - ctx->bc->callstack[ctx->bc->call_sp].max =
> - ctx->bc->callstack[ctx->bc->call_sp].current;
> - }
> + callstack_update_max_depth(ctx, reason);
> }
>
> static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
> @@ -5665,7 +5709,7 @@ static int tgsi_if(struct r600_shader_ctx *ctx)
>
> fc_pushlevel(ctx, FC_IF);
>
> - callstack_check_depth(ctx, FC_PUSH_VPM, 0);
> + callstack_push(ctx, FC_PUSH_VPM);
> return 0;
> }
>
> @@ -5695,7 +5739,7 @@ static int tgsi_endif(struct r600_shader_ctx *ctx)
> }
> fc_poplevel(ctx);
>
> - callstack_decrease_current(ctx, FC_PUSH_VPM);
> + callstack_pop(ctx, FC_PUSH_VPM);
> return 0;
> }
>
> @@ -5708,7 +5752,7 @@ static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
> fc_pushlevel(ctx, FC_LOOP);
>
> /* check stack depth */
> - callstack_check_depth(ctx, FC_LOOP, 0);
> + callstack_push(ctx, FC_LOOP);
> return 0;
> }
>
> @@ -5737,7 +5781,7 @@ static int tgsi_endloop(struct r600_shader_ctx *ctx)
> }
> /* XXX add LOOPRET support */
> fc_poplevel(ctx);
> - callstack_decrease_current(ctx, FC_LOOP);
> + callstack_pop(ctx, FC_LOOP);
> return 0;
> }
>
> @@ -5760,7 +5804,6 @@ static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
>
> fc_set_mid(ctx, fscp);
>
> - callstack_check_depth(ctx, FC_PUSH_VPM, 1);
> return 0;
> }
>
> --
> 1.8.1.2
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list