[Mesa-dev] [PATCH 2/9] i965/fs: Jump from discard statements to the end of the program when done.
Kenneth Graunke
kenneth at whitecape.org
Thu Mar 15 13:16:33 PDT 2012
On 03/12/2012 06:08 PM, Eric Anholt wrote:
> From the GLSL 1.30 spec:
>
> The discard keyword is only allowed within fragment shaders. It
> can be used within a fragment shader to abandon the operation on
> the current fragment. This keyword causes the fragment to be
> discarded and no updates to any buffers will occur. Control flow
> exits the shader, and subsequent implicit or explicit derivatives
> are undefined when this control flow is non-uniform (meaning
> different fragments within the primitive take different control
> paths).
> ---
> src/mesa/drivers/dri/i965/brw_eu.h | 1 +
> src/mesa/drivers/dri/i965/brw_eu_emit.c | 53 +++++++++++++++++++++++++---
> src/mesa/drivers/dri/i965/brw_fs.h | 22 +++++++++++
> src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 55 +++++++++++++++++++++++++++++
> 4 files changed, 126 insertions(+), 5 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
> index dbc84be..373f2ab 100644
> --- a/src/mesa/drivers/dri/i965/brw_eu.h
> +++ b/src/mesa/drivers/dri/i965/brw_eu.h
> @@ -1063,6 +1063,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p);
> struct brw_instruction *brw_BREAK(struct brw_compile *p);
> struct brw_instruction *brw_CONT(struct brw_compile *p);
> struct brw_instruction *gen6_CONT(struct brw_compile *p);
> +struct brw_instruction *gen6_HALT(struct brw_compile *p);
> /* Forward jumps:
> */
> void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx);
> diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
> index 179b59a..597157e 100644
> --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
> +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
> @@ -1421,6 +1421,20 @@ struct brw_instruction *brw_CONT(struct brw_compile *p)
> return insn;
> }
>
> +struct brw_instruction *gen6_HALT(struct brw_compile *p)
> +{
> + struct brw_instruction *insn;
> +
> + insn = next_insn(p, BRW_OPCODE_HALT);
> + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
> + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
> + brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
> +
> + insn->header.compression_control = BRW_COMPRESSION_NONE;
> + insn->header.execution_size = BRW_EXECUTE_8;
> + return insn;
> +}
> +
> /* DO/WHILE loop:
> *
> * The DO/WHILE is just an unterminated loop -- break or continue are
> @@ -2477,8 +2491,8 @@ brw_find_next_block_end(struct brw_compile *p, int start)
> return ip;
> }
> }
> - assert(!"not reached");
> - return start + 1;
> +
> + return 0;
> }
>
> /* There is no DO instruction on gen6, so to find the end of the loop
> @@ -2507,7 +2521,7 @@ brw_find_loop_end(struct brw_compile *p, int start)
> }
>
> /* After program generation, go back and update the UIP and JIP of
> - * BREAK and CONT instructions to their correct locations.
> + * BREAK, CONT, and HALT instructions to their correct locations.
> */
> void
> brw_set_uip_jip(struct brw_compile *p)
> @@ -2521,21 +2535,50 @@ brw_set_uip_jip(struct brw_compile *p)
>
> for (ip = 0; ip< p->nr_insn; ip++) {
> struct brw_instruction *insn =&p->store[ip];
> + int block_end_ip = 0;
> +
> + if (insn->header.opcode == BRW_OPCODE_BREAK ||
> + insn->header.opcode == BRW_OPCODE_CONTINUE ||
> + insn->header.opcode == BRW_OPCODE_HALT) {
> + block_end_ip = brw_find_next_block_end(p, ip);
> + }
>
> switch (insn->header.opcode) {
> case BRW_OPCODE_BREAK:
> - insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
> + assert(block_end_ip != 0);
> + insn->bits3.break_cont.jip = br * (block_end_ip - ip);
> /* Gen7 UIP points to WHILE; Gen6 points just after it */
> insn->bits3.break_cont.uip =
> br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
> break;
> case BRW_OPCODE_CONTINUE:
> - insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
> + assert(block_end_ip != 0);
> + insn->bits3.break_cont.jip = br * (block_end_ip - ip);
> insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);
>
> assert(insn->bits3.break_cont.uip != 0);
> assert(insn->bits3.break_cont.jip != 0);
> break;
> + case BRW_OPCODE_HALT:
> + /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
> + *
> + * "In case of the halt instruction not inside any conditional code
> + * block, the value of<JIP> and<UIP> should be the same. In case
> + * of the halt instruction inside conditional code block, the<UIP>
> + * should be the end of the program, and the<JIP> should be end of
> + * the most inner conditional code block."
> + *
> + * The uip will have already been set by whoever set up the
> + * instruction.
> + */
> + if (block_end_ip == 0) {
> + insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
> + } else {
> + insn->bits3.break_cont.jip = br * (block_end_ip - ip);
> + }
> + assert(insn->bits3.break_cont.uip != 0);
> + assert(insn->bits3.break_cont.jip != 0);
> + break;
> }
> }
> }
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index 0a37b39..7aebffa 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -171,6 +171,26 @@ static const fs_reg reg_undef;
> static const fs_reg reg_null_f(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_F);
> static const fs_reg reg_null_d(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_D);
>
> +class ip_record : public exec_node {
> +public:
> + static void* operator new(size_t size, void *ctx)
> + {
> + void *node;
> +
> + node = rzalloc_size(ctx, size);
> + assert(node != NULL);
> +
> + return node;
> + }
> +
> + ip_record(int ip)
> + {
> + this->ip = ip;
> + }
> +
> + int ip;
> +};
*grumble* I want a real boxed list *grumble*
(but I know that'll end up in no end of bike-shedding and fist-shaking,
so I'm not actually going to object to this...for now :))
> +
> class fs_inst : public exec_node {
> public:
> /* Callers of this ralloc-based new need not call delete. It's
> @@ -489,6 +509,7 @@ public:
> bool remove_duplicate_mrf_writes();
> bool virtual_grf_interferes(int a, int b);
> void schedule_instructions();
> + void patch_discard_jumps_to_fb_writes();
> void fail(const char *msg, ...);
>
> void push_force_uncompressed();
> @@ -572,6 +593,7 @@ public:
> struct gl_shader_program *prog;
> void *mem_ctx;
> exec_list instructions;
> + exec_list discard_halt_patches;
>
> /* Delayed setup of c->prog_data.params[] due to realloc of
> * ParamValues[] during compile.
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
> index 0c32f08..7bc0c2c 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
> @@ -37,11 +37,55 @@ extern "C" {
> #include "glsl/ir_print_visitor.h"
>
> void
> +fs_visitor::patch_discard_jumps_to_fb_writes()
> +{
> + if (intel->gen< 6)
> + return;
> +
> + /* There is a somewhat strange undocumented requirement of using
> + * HALT, according to the simulator. If some channel has HALTed to
> + * a particular UIP, then by the end of the program, every channel
> + * must have HALTed to that UIP. Furthermore, the tracking is a
> + * stack, so you can't do the final halt of a UIP after starting
> + * halting to a new UIP.
> + *
> + * Symptoms of not emitting this instruction on actual hardware
> + * included GPU hangs and sparkly rendering on the piglit discard
> + * tests.
> + */
> + struct brw_instruction *last_halt = gen6_HALT(p);
> + last_halt->bits3.break_cont.uip = 2;
> + last_halt->bits3.break_cont.jip = 2;
> +
> + int ip = p->nr_insn;
> +
> + foreach_list(node,&this->discard_halt_patches) {
> + ip_record *patch_ip = (ip_record *)node;
> + struct brw_instruction *patch =&p->store[patch_ip->ip];
> + int br = (intel->gen>= 5) ? 2 : 1;
> +
> + /* HALT takes a distance from the pre-incremented IP, so '1'
> + * would be the next instruction after jmpi.
> + */
> + assert(patch->header.opcode == BRW_OPCODE_HALT);
> + patch->bits3.break_cont.uip = (ip - patch_ip->ip) * br;
> + }
> +
> + this->discard_halt_patches.make_empty();
> +}
As you noted, this does emit HALT even when there are no discards.
Assuming you fix that,
Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>
It would be nice to do this on Gen4-5 too. But we can add that to our
ongoing list.
> +
> +void
> fs_visitor::generate_fb_write(fs_inst *inst)
> {
> bool eot = inst->eot;
> struct brw_reg implied_header;
>
> + /* Note that the jumps emitted to this point mean that the g0 ->
> + * base_mrf setup must be inside of this function, so that we jump
> + * to a point containing it.
> + */
> + patch_discard_jumps_to_fb_writes();
> +
> /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
> * move, here's g1.
> */
> @@ -482,6 +526,17 @@ fs_visitor::generate_discard(fs_inst *inst)
> brw_set_mask_control(p, BRW_MASK_DISABLE);
> brw_AND(p, g1, f0, g1);
> brw_pop_insn_state(p);
> +
> + /* GLSL 1.30+ say that discarded channels should stop executing
> + * (so, for example, an infinite loop that would otherwise in
> + * just that channel does not occur.
> + *
> + * This HALT will be patched up at FB write time to point UIP at
> + * the end of the program, and at brw_uip_jip() JIP will be set
> + * to the end of the current block (or the program).
> + */
> + this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
> + gen6_HALT(p);
> } else {
> struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
More information about the mesa-dev
mailing list