Mesa (master): r300/compiler: r500 hw support for break and continue in loops.

Tom Stellard tstellar at kemper.freedesktop.org
Wed Aug 4 04:03:12 UTC 2010


Module: Mesa
Branch: master
Commit: 2824d5687a19e42ba0da8fd08e80610c4469a3b3
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=2824d5687a19e42ba0da8fd08e80610c4469a3b3

Author: Tom Stellard <tstellar at gmail.com>
Date:   Tue Aug  3 15:23:23 2010 -0700

r300/compiler: r500 hw support for break and continue in loops.

The BGNLOOP and ENDLOOP instructions are now being used correctly, which
makes break and continue possible.  The deadcode pass has been modified to
handle breaks, and the compiler is more careful about which loops are
unrolled.

---

 src/gallium/drivers/r300/r300_fs.c                 |    9 +-
 src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c |    8 +-
 src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c |    9 +-
 src/mesa/drivers/dri/r300/compiler/r500_fragprog.c |   25 --
 src/mesa/drivers/dri/r300/compiler/r500_fragprog.h |    2 -
 .../drivers/dri/r300/compiler/r500_fragprog_emit.c |  114 +++++++---
 src/mesa/drivers/dri/r300/compiler/radeon_code.h   |    3 +
 .../dri/r300/compiler/radeon_dataflow_deadcode.c   |   39 +++-
 .../dri/r300/compiler/radeon_emulate_loops.c       |  237 ++++++++++++-------
 .../dri/r300/compiler/radeon_emulate_loops.h       |    9 +-
 .../drivers/dri/r300/compiler/radeon_optimize.c    |    3 +-
 11 files changed, 289 insertions(+), 169 deletions(-)

diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index db52699..87ff49a 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -248,13 +248,18 @@ static void r300_emit_fs_code_to_buffer(
 
         shader->cb_code_size = 19 +
                                ((code->inst_end + 1) * 6) +
-                               imm_count * 7;
+                               imm_count * 7 +
+			       code->int_constant_count * 2;
 
         NEW_CB(shader->cb_code, shader->cb_code_size);
         OUT_CB_REG(R500_US_CONFIG, R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
         OUT_CB_REG(R500_US_PIXSIZE, code->max_temp_idx);
         OUT_CB_REG(R500_US_FC_CTRL, code->us_fc_ctrl);
-        OUT_CB_REG(R500_US_CODE_RANGE,
+        for(i = 0; i < code->int_constant_count; i++){
+		OUT_CB_REG(R500_US_FC_INT_CONST_0 + (i * 4),
+						code->int_constants[i]);
+	}
+	OUT_CB_REG(R500_US_CODE_RANGE,
                    R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(code->inst_end));
         OUT_CB_REG(R500_US_CODE_OFFSET, 0);
         OUT_CB_REG(R500_US_CODE_ADDR,
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
index a326ee4..0709394 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
@@ -109,13 +109,13 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 	debug_program_log(c, "before compilation");
 
 	if (c->Base.is_r500){
-		r500_transform_unroll_loops(&c->Base, &loop_state);	
-		debug_program_log(c, "after r500 transform loops");
+		rc_unroll_loops(&c->Base, R500_PFS_MAX_INST);
+		debug_program_log(c, "after unroll loops");
 	}
 	else{
-		rc_transform_unroll_loops(&c->Base, &loop_state);
+		rc_transform_loops(&c->Base, &loop_state, R300_PFS_MAX_ALU_INST);
 		debug_program_log(c, "after transform loops");
-		
+
 		rc_emulate_branches(&c->Base);
 		debug_program_log(c, "after emulate branches");
 	}
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
index d347b4d..fe34ff6 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
@@ -633,7 +633,7 @@ static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 {
 	struct emulate_loop_state loop_state;
-	
+
 	compiler->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
 
 	addArtificialOutputs(compiler);
@@ -643,14 +643,13 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 	/* XXX Ideally this should be done only for r3xx, but since
 	 * we don't have branching support for r5xx, we use the emulation
 	 * on all chipsets. */
-	rc_transform_unroll_loops(&compiler->Base, &loop_state);
-	
-	debug_program_log(compiler, "after transform loops");
-	
+
 	if (compiler->Base.is_r500){
+		rc_transform_loops(&compiler->Base, &loop_state, R500_VS_MAX_ALU);
 		rc_emulate_loops(&loop_state, R500_VS_MAX_ALU);
 	} else {
+		rc_transform_loops(&compiler->Base, &loop_state, R300_VS_MAX_ALU);
 		rc_emulate_loops(&loop_state, R300_VS_MAX_ALU);
 	}
 	debug_program_log(compiler, "after emulate loops");
 
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
index e6b5522..95be619 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
@@ -60,31 +60,6 @@ int r500_transform_IF(
 	return 1;
 }
 
-/**
- * Rewrite loops to make them easier to emit.  This is not a local
- * transformation, because it modifies and reorders an entire block of code.
- */
-void r500_transform_unroll_loops(struct radeon_compiler * c,
-						struct emulate_loop_state *s)
-{
-	int i;
-	
-	rc_transform_unroll_loops(c, s);
-	
-	for( i = s->LoopCount - 1; i >= 0; i-- ){
-		struct rc_instruction * inst_continue;
-		if(!s->Loops[i].EndLoop){
-			continue;
-		}
-		/* Insert a continue instruction at the end of the loop.  This
-		 * is required in order to emit loops correctly. */
-		inst_continue = rc_insert_new_instruction(c,
-						s->Loops[i].EndIf->Prev);
-		inst_continue->U.I.Opcode = RC_OPCODE_CONTINUE;
-	}
-
-}
-
 static int r500_swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
 {
 	unsigned int relevant;
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
index 0d005a7..3417335 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
@@ -49,6 +49,4 @@ extern int r500_transform_IF(
 	struct rc_instruction * inst,
 	void* data);
 
-void r500_transform_unroll_loops(struct radeon_compiler * c,
-						struct emulate_loop_state * s);
 #endif
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
index 0bd8f0a..c3f817a 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
@@ -64,7 +64,12 @@ struct branch_info {
 };
 
 struct loop_info {
-	int LoopStart;
+	int BgnLoop;
+
+	int BranchDepth;
+	int * Brks;
+	int BrkCount;
+	int BrkReserved;
 };
 
 struct emit_state {
@@ -368,6 +373,12 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 
 	unsigned int newip = ++s->Code->inst_end;
 
+	/* Currently all loops use the same integer constant to initialize
+	 * the loop variables. */
+	if(!s->Code->int_constants[0]) {
+		s->Code->int_constants[0] = R500_FC_INT_CONST_KR(0xff);
+		s->Code->int_constant_count = 1;
+	}
 	s->Code->inst[newip].inst0 = R500_INST_TYPE_FC | R500_INST_ALU_WAIT;
 
 	switch(inst->U.I.Opcode){
@@ -378,32 +389,69 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 			s->Loops, s->CurrentLoopDepth, s->LoopsReserved, 1);
 
 		loop = &s->Loops[s->CurrentLoopDepth++];
-		
-		/* We don't emit an instruction for BGNLOOP, so we need to
-		 * decrement the instruction counter, but first we need to
-		 * set LoopStart to the current value of inst_end, which
-		 * will end up being the first real instruction in the loop.*/
-		loop->LoopStart = s->Code->inst_end--;
+		memset(loop, 0, sizeof(struct loop_info));
+		loop->BranchDepth = s->CurrentBranchDepth;
+		loop->BgnLoop = newip;
+
+		s->Code->inst[newip].inst2 = R500_FC_OP_LOOP
+			| R500_FC_JUMP_FUNC(0x00)
+			| R500_FC_IGNORE_UNCOVERED
+			;
 		break;
-	
 	case RC_OPCODE_BRK:
-		/* Don't emit an instruction for BRK */
-		s->Code->inst_end--;
+		loop = &s->Loops[s->CurrentLoopDepth - 1];
+		memory_pool_array_reserve(&s->C->Pool, int, loop->Brks,
+					loop->BrkCount, loop->BrkReserved, 1);
+
+		loop->Brks[loop->BrkCount++] = newip;
+		s->Code->inst[newip].inst2 = R500_FC_OP_BREAKLOOP
+			| R500_FC_JUMP_FUNC(0xff)
+			| R500_FC_B_OP1_DECR
+			| R500_FC_B_POP_CNT(
+				s->CurrentBranchDepth - loop->BranchDepth)
+			| R500_FC_IGNORE_UNCOVERED
+			;
 		break;
 
 	case RC_OPCODE_CONTINUE:
 		loop = &s->Loops[s->CurrentLoopDepth - 1];
-		s->Code->inst[newip].inst2 = R500_FC_OP_JUMP |
-			R500_FC_JUMP_FUNC(0xff);
-		s->Code->inst[newip].inst3 = R500_FC_JUMP_ADDR(loop->LoopStart);
+		s->Code->inst[newip].inst2 = R500_FC_OP_JUMP
+			| R500_FC_JUMP_FUNC(0xff)
+			| R500_FC_B_OP1_DECR
+			| R500_FC_B_POP_CNT(
+				s->CurrentBranchDepth -	loop->BranchDepth)
+			;
+		s->Code->inst[newip].inst3 = R500_FC_JUMP_ADDR(loop->BgnLoop);
 		break;
 
 	case RC_OPCODE_ENDLOOP:
-		/* Don't emit an instruction for ENDLOOP */
-		s->Code->inst_end--;
+	{
+		unsigned int i;
+		loop = &s->Loops[s->CurrentLoopDepth - 1];
+		/* Emit ENDLOOP */
+		s->Code->inst[newip].inst2 = R500_FC_OP_ENDLOOP
+			| R500_FC_JUMP_FUNC(0xff)
+			| R500_FC_JUMP_ANY
+			| R500_FC_IGNORE_UNCOVERED
+			;
+		/* The constant integer at index 0 is used by all loops. */
+		s->Code->inst[newip].inst3 = R500_FC_INT_ADDR(0)
+			| R500_FC_JUMP_ADDR(loop->BgnLoop + 1)
+			;
+
+		/* Set jump address and int constant for BGNLOOP */
+		s->Code->inst[loop->BgnLoop].inst3 = R500_FC_INT_ADDR(0)
+			| R500_FC_JUMP_ADDR(newip)
+			;
+
+		/* Set jump address for the BRK instructions. */
+		while(loop->BrkCount--) {
+			s->Code->inst[loop->Brks[loop->BrkCount]].inst3 =
+						R500_FC_JUMP_ADDR(newip + 1);
+		}
 		s->CurrentLoopDepth--;
 		break;
-
+	}
 	case RC_OPCODE_IF:
 		if ( s->CurrentBranchDepth >= MAX_BRANCH_DEPTH_FULL) {
 			rc_error(s->C, "Branch depth exceeds hardware limit");
@@ -442,24 +490,16 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 		}
 
 		branch = &s->Branches[s->CurrentBranchDepth - 1];
-		
-		if(inst->Prev->U.I.Opcode == RC_OPCODE_BRK){
-			branch->Endif = --s->Code->inst_end;
-			s->Code->inst[branch->Endif].inst2 |=
-				R500_FC_B_OP0_DECR;
-		}
-		else{
-			branch->Endif = newip;
-		
-			s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP
-				| R500_FC_A_OP_NONE /* no address stack */
-				| R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */
-				| R500_FC_B_OP0_DECR /* decrement branch counter if stay */
-				| R500_FC_B_OP1_NONE /* no branch counter if stay */
-				| R500_FC_B_POP_CNT(1)
+		branch->Endif = newip;
+
+		s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP
+			| R500_FC_A_OP_NONE /* no address stack */
+			| R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */
+			| R500_FC_B_OP0_DECR /* decrement branch counter if stay */
+			| R500_FC_B_OP1_NONE /* no branch counter if stay */
+			| R500_FC_B_POP_CNT(1)
 			;
-			s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
-		}
+		s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
 		s->Code->inst[branch->If].inst2 = R500_FC_OP_JUMP
 			| R500_FC_A_OP_NONE /* no address stack */
 			| R500_FC_JUMP_FUNC(0x0f) /* jump if ALU result is false */
@@ -544,11 +584,9 @@ void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compi
 		code->inst[ip].inst0 = R500_INST_TYPE_OUT | R500_INST_TEX_SEM_WAIT;
 	}
 
-	/* Use FULL flow control mode if branches are nested deep enough.
-	 * We don not need to enable FULL flow control mode for loops, becasue
-	 * we aren't using the hardware loop instructions.
-	 */
-	if (s.MaxBranchDepth >= 4) {
+	/* Enable full flow control mode if we are using loops or have if
+	 * statements nested at least four deep. */
+	if (s.MaxBranchDepth >= 4 || s.LoopsReserved > 0) {
 		if (code->max_temp_idx < 1)
 			code->max_temp_idx = 1;
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.h b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
index d036897..e14a352 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_code.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
@@ -221,6 +221,9 @@ struct r500_fragment_program_code {
 	int max_temp_idx;
 
 	uint32_t us_fc_ctrl;
+
+	uint32_t int_constants[32];
+	uint32_t int_constant_count;
 };
 
 struct rX00_fragment_program_code {
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
index fbb4235..31566a9 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
@@ -43,6 +43,12 @@ struct instruction_state {
 	unsigned char SrcReg[3];
 };
 
+struct loopinfo {
+	struct updatemask_state * Breaks;
+	unsigned int BreakCount;
+	unsigned int BreaksReserved;
+};
+
 struct branchinfo {
 	unsigned int HaveElse:1;
 
@@ -59,6 +65,10 @@ struct deadcode_state {
 	struct branchinfo * BranchStack;
 	unsigned int BranchStackSize;
 	unsigned int BranchStackReserved;
+
+	struct loopinfo * LoopStack;
+	unsigned int LoopStackSize;
+	unsigned int LoopStackReserved;
 };
 
 
@@ -78,6 +88,22 @@ static void or_updatemasks(
 	dst->Address = a->Address | b->Address;
 }
 
+static void push_break(struct deadcode_state *s)
+{
+	struct loopinfo * loop = &s->LoopStack[s->LoopStackSize - 1];
+	memory_pool_array_reserve(&s->C->Pool, struct updatemask_state,
+		loop->Breaks, loop->BreakCount, loop->BreaksReserved, 1);
+
+	memcpy(&loop->Breaks[loop->BreakCount++], &s->R, sizeof(s->R));
+}
+
+static void push_loop(struct deadcode_state * s)
+{
+	memory_pool_array_reserve(&s->C->Pool, struct loopinfo, s->LoopStack,
+			s->LoopStackSize, s->LoopStackReserved, 1);
+	memset(&s->LoopStack[s->LoopStackSize++], 0, sizeof(struct loopinfo));
+}
+
 static void push_branch(struct deadcode_state * s)
 {
 	memory_pool_array_reserve(&s->C->Pool, struct branchinfo, s->BranchStack,
@@ -233,11 +259,22 @@ void rc_dataflow_deadcode(struct radeon_compiler * c, rc_dataflow_mark_outputs_f
 					}
 				}
 			}
+			push_loop(&s);
 			break;
 		}
-		case RC_OPCODE_CONTINUE:
 		case RC_OPCODE_BRK:
+			push_break(&s);
+			break;
 		case RC_OPCODE_BGNLOOP:
+		{
+			unsigned int i;
+			struct loopinfo * loop = &s.LoopStack[s.LoopStackSize-1];
+			for(i = 0; i < loop->BreakCount; i++) {
+				or_updatemasks(&s.R, &s.R, &loop->Breaks[i]);
+			}
+			break;
+		}
+		case RC_OPCODE_CONTINUE:
 			break;
 		case RC_OPCODE_ENDIF:
 			push_branch(&s);
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
index fed4d88..94e3e5f 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
@@ -39,7 +39,6 @@
 #define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)
 
 struct const_value {
-	
 	struct radeon_compiler * C;
 	struct rc_src_register * Src;
 	float Value;
@@ -78,17 +77,17 @@ static int src_reg_is_immediate(struct rc_src_register * src,
 	c->Program.Constants.Constants[src->Index].Type==RC_CONSTANT_IMMEDIATE;
 }
 
-static unsigned int loop_calc_iterations(struct emulate_loop_state *s, 
-			struct loop_info * loop, unsigned int max_instructions)
+static unsigned int loop_max_possible_iterations(struct radeon_compiler *c,
+			struct loop_info * loop, unsigned int prog_inst_limit)
 {
-	unsigned int total_i = rc_recompute_ips(s->C);
+	unsigned int total_i = rc_recompute_ips(c);
 	unsigned int loop_i = (loop->EndLoop->IP - loop->BeginLoop->IP) - 1;
 	/* +1 because the program already has one iteration of the loop. */
-	return 1 + ((max_instructions - total_i) / (s->LoopCount * loop_i));
+	return 1 + ((prog_inst_limit - total_i) / loop_i);
 }
 
-static void loop_unroll(struct emulate_loop_state * s,
-			struct loop_info *loop, unsigned int iterations)
+static void unroll_loop(struct radeon_compiler * c, struct loop_info * loop,
+						unsigned int iterations)
 {
 	unsigned int i;
 	struct rc_instruction * ptr;
@@ -99,7 +98,7 @@ static void loop_unroll(struct emulate_loop_state * s,
 	rc_remove_instruction(loop->EndLoop);
 	for( i = 1; i < iterations; i++){
 		for(ptr = first; ptr != last->Next; ptr = ptr->Next){
-			struct rc_instruction *new = rc_alloc_instruction(s->C);
+			struct rc_instruction *new = rc_alloc_instruction(c);
 			memcpy(new, ptr, sizeof(struct rc_instruction));
 			rc_insert_instruction(append_to, new);
 			append_to = new;
@@ -115,7 +114,7 @@ static void update_const_value(void * data, struct rc_instruction * inst,
 	if(value->Src->File != file ||
 	   value->Src->Index != index ||
 	   !(1 << GET_SWZ(value->Src->Swizzle, 0) & mask)){
-	   	return;
+		return;
 	}
 	switch(inst->U.I.Opcode){
 	case RC_OPCODE_MOV:
@@ -140,7 +139,7 @@ static void get_incr_amount(void * data, struct rc_instruction * inst,
 	if(file != RC_FILE_TEMPORARY ||
 	   count_inst->Index != index ||
 	   (1 << GET_SWZ(count_inst->Swz,0) != mask)){
-	   	return;
+		return;
 	}
 	/* Find the index of the counter register. */
 	opcode = rc_get_opcode_info(inst->U.I.Opcode);
@@ -185,11 +184,10 @@ static void get_incr_amount(void * data, struct rc_instruction * inst,
 		count_inst->Unknown = 1;
 		return;
 	}
-	
 }
 
-static int transform_const_loop(struct emulate_loop_state * s,
-						struct loop_info * loop)
+static int try_unroll_loop(struct radeon_compiler * c, struct loop_info * loop,
+						unsigned int prog_inst_limit)
 {
 	int end_loops;
 	int iterations;
@@ -201,12 +199,12 @@ static int transform_const_loop(struct emulate_loop_state * s,
 	struct rc_instruction * inst;
 
 	/* Find the counter and the upper limit */
-	
-	if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[0], s->C)){
+
+	if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[0], c)){
 		limit = &loop->Cond->U.I.SrcReg[0];
 		counter = &loop->Cond->U.I.SrcReg[1];
 	}
-	else if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[1], s->C)){
+	else if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[1], c)){
 		limit = &loop->Cond->U.I.SrcReg[1];
 		counter = &loop->Cond->U.I.SrcReg[0];
 	}
@@ -214,13 +212,13 @@ static int transform_const_loop(struct emulate_loop_state * s,
 		DBG("No constant limit.\n");
 		return 0;
 	}
-	
+
 	/* Find the initial value of the counter */
 	counter_value.Src = counter;
 	counter_value.Value = 0.0f;
 	counter_value.HasValue = 0;
-	counter_value.C = s->C;
-	for(inst = s->C->Program.Instructions.Next; inst != loop->BeginLoop;
+	counter_value.C = c;
+	for(inst = c->Program.Instructions.Next; inst != loop->BeginLoop;
 							inst = inst->Next){
 		rc_for_all_writes_mask(inst, update_const_value, &counter_value);
 	}
@@ -230,7 +228,7 @@ static int transform_const_loop(struct emulate_loop_state * s,
 	}
 	DBG("Initial counter value is %f\n", counter_value.Value);
 	/* Determine how the counter is modified each loop */
-	count_inst.C = s->C;
+	count_inst.C = c;
 	count_inst.Index = counter->Index;
 	count_inst.Swz = counter->Swizzle;
 	count_inst.Amount = 0.0f;
@@ -277,17 +275,20 @@ static int transform_const_loop(struct emulate_loop_state * s,
 	/* Calculate the number of iterations of this loop.  Keeping this
 	 * simple, since we only support increment and decrement loops.
 	 */
-	limit_value = get_constant_value(s->C, limit, 0);
+	limit_value = get_constant_value(c, limit, 0);
 	DBG("Limit is %f.\n", limit_value);
+	/* The iteration calculations are opposite of what you would expect.
+	 * In a normal loop, if the condition is met, then the loop continues,
+	 * but with our loops, if the condition is met, the loop is exited. */
 	switch(loop->Cond->U.I.Opcode){
-	case RC_OPCODE_SGT:
-	case RC_OPCODE_SLT:
+	case RC_OPCODE_SGE:
+	case RC_OPCODE_SLE:
 		iterations = (int) ceilf((limit_value - counter_value.Value) /
 							count_inst.Amount);
 		break;
 
-	case RC_OPCODE_SLE:
-	case RC_OPCODE_SGE:
+	case RC_OPCODE_SGT:
+	case RC_OPCODE_SLT:
 		iterations = (int) floorf((limit_value - counter_value.Value) /
 							count_inst.Amount) + 1;
 		break;
@@ -295,77 +296,84 @@ static int transform_const_loop(struct emulate_loop_state * s,
 		return 0;
 	}
 
+	if (iterations > loop_max_possible_iterations(c, loop,
+							prog_inst_limit)) {
+		return 0;
+	}
+
 	DBG("Loop will have %d iterations.\n", iterations);
-	
+
 	/* Prepare loop for unrolling */
 	rc_remove_instruction(loop->Cond);
 	rc_remove_instruction(loop->If);
 	rc_remove_instruction(loop->Brk);
 	rc_remove_instruction(loop->EndIf);
-	
-	loop_unroll(s, loop, iterations);
+
+	unroll_loop(c, loop, iterations);
 	loop->EndLoop = NULL;
 	return 1;
 }
 
-/** 
- * This function prepares a loop to be unrolled by converting it into an if
- * statement.  Here is an outline of the conversion process:
- * BGNLOOP;                         	-> BGNLOOP;
- * <Additional conditional code>	-> <Additional conditional code>
- * SGE/SLT temp[0], temp[1], temp[2];	-> SLT/SGE temp[0], temp[1], temp[2];
- * IF temp[0];                      	-> IF temp[0];
- * BRK;                             	->
- * ENDIF;                           	-> <Loop Body>
- * <Loop Body>                      	-> ENDIF;
- * ENDLOOP;                         	-> ENDLOOP
- *
+/**
+ * @param c
+ * @param loop
  * @param inst A pointer to a BGNLOOP instruction.
- * @return If the loop can be unrolled, a pointer to the first instruction of
- * 		the unrolled loop.
- * 	   Otherwise, A pointer to the ENDLOOP instruction.
- * 	   Null if there is an error.
+ * @return 1 if all of the members of loop were set.
+ * @return 0 if there was an error and some members of loop are still NULL.
  */
-static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
+static int build_loop_info(struct radeon_compiler * c, struct loop_info * loop,
 						struct rc_instruction * inst)
 {
-	struct loop_info *loop;
 	struct rc_instruction * ptr;
 
-	memory_pool_array_reserve(&s->C->Pool, struct loop_info,
-			s->Loops, s->LoopCount, s->LoopReserved, 1);
-
-	loop = &s->Loops[s->LoopCount++];
-	memset(loop, 0, sizeof(struct loop_info));
 	if(inst->U.I.Opcode != RC_OPCODE_BGNLOOP){
-		rc_error(s->C, "expected BGNLOOP\n", __FUNCTION__);
-		return NULL;
+		rc_error(c, "%s: expected BGNLOOP", __FUNCTION__);
+		return 0;
 	}
+
+	memset(loop, 0, sizeof(struct loop_info));
+
 	loop->BeginLoop = inst;
 
-	for(ptr = loop->BeginLoop->Next; !loop->EndLoop; ptr = ptr->Next){
+	for(ptr = loop->BeginLoop->Next; !loop->EndLoop; ptr = ptr->Next) {
+
+		if (ptr == &c->Program.Instructions) {
+			rc_error(c, "%s: BGNLOOP without an ENDLOOOP.\n",
+								__FUNCTION__);
+			return 0;
+		}
+
 		switch(ptr->U.I.Opcode){
 		case RC_OPCODE_BGNLOOP:
-			/* Nested loop */
-			ptr = transform_loop(s, ptr);
-			if(!ptr){
-				return NULL;
+		{
+			/* Nested loop, skip ahead to the end. */
+			unsigned int loop_depth = 1;
+			for(ptr = ptr->Next; ptr != &c->Program.Instructions;
+							ptr = ptr->Next){
+				if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+					loop_depth++;
+				} else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
+					if (!--loop_depth) {
+						break;
+					}
+				}
+			}
+			if (ptr == &c->Program.Instructions) {
+				rc_error(c, "%s: BGNLOOP without an ENDLOOOP\n",
+								__FUNCTION__);
+					return 0;
 			}
 			break;
+		}
 		case RC_OPCODE_BRK:
-			loop->Brk = ptr;
-			if(ptr->Next->U.I.Opcode != RC_OPCODE_ENDIF){
-				rc_error(s->C,
-					"%s: expected ENDIF\n",__FUNCTION__);
-				return NULL;
-			}
-			loop->EndIf = ptr->Next;
-			if(ptr->Prev->U.I.Opcode != RC_OPCODE_IF){
-				rc_error(s->C,
-					"%s: expected IF\n", __FUNCTION__);
-				return NULL;
+			if(ptr->Next->U.I.Opcode != RC_OPCODE_ENDIF
+					|| ptr->Prev->U.I.Opcode != RC_OPCODE_IF
+					|| loop->Brk){
+				continue;
 			}
+			loop->Brk = ptr;
 			loop->If = ptr->Prev;
+			loop->EndIf = ptr->Next;
 			switch(loop->If->Prev->U.I.Opcode){
 			case RC_OPCODE_SLT:
 			case RC_OPCODE_SGE:
@@ -375,18 +383,62 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
 			case RC_OPCODE_SNE:
 				break;
 			default:
-				rc_error(s->C, "%s expected conditional\n",
+				rc_error(c, "%s: expected conditional",
 								__FUNCTION__);
-				return NULL;
+				return 0;
 			}
 			loop->Cond = loop->If->Prev;
-			ptr = loop->EndIf;
 			break;
+
 		case RC_OPCODE_ENDLOOP:
 			loop->EndLoop = ptr;
 			break;
 		}
 	}
+
+	if (loop->BeginLoop && loop->Brk && loop->If && loop->EndIf
+					&& loop->Cond && loop->EndLoop) {
+		return 1;
+	}
+	return 0;
+}
+
+/**
+ * This function prepares a loop to be unrolled by converting it into an if
+ * statement.  Here is an outline of the conversion process:
+ * BGNLOOP;                         	-> BGNLOOP;
+ * <Additional conditional code>	-> <Additional conditional code>
+ * SGE/SLT temp[0], temp[1], temp[2];	-> SLT/SGE temp[0], temp[1], temp[2];
+ * IF temp[0];                      	-> IF temp[0];
+ * BRK;                             	->
+ * ENDIF;                           	-> <Loop Body>
+ * <Loop Body>                      	-> ENDIF;
+ * ENDLOOP;                         	-> ENDLOOP
+ *
+ * @param inst A pointer to a BGNLOOP instruction.
+ * @return If the loop can be unrolled, a pointer to the first instruction of
+ * 		the unrolled loop.
+ * 	   Otherwise, A pointer to the ENDLOOP instruction.
+ * 	   Null if there is an error.
+ */
+static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
+						struct rc_instruction * inst,
+						int prog_inst_limit)
+{
+	struct loop_info * loop;
+
+	memory_pool_array_reserve(&s->C->Pool, struct loop_info,
+			s->Loops, s->LoopCount, s->LoopReserved, 1);
+
+	loop = &s->Loops[s->LoopCount++];
+
+	if (!build_loop_info(s->C, loop, inst))
+		return NULL;
+
+	if(try_unroll_loop(s->C, loop, prog_inst_limit)){
+		return loop->BeginLoop->Next;
+	}
+
 	/* Reverse the conditional instruction */
 	switch(loop->Cond->U.I.Opcode){
 	case RC_OPCODE_SGE:
@@ -411,31 +463,27 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
 		rc_error(s->C, "loop->Cond is not a conditional.\n");
 		return NULL;
 	}
-	
-	/* Check if the number of loops is known at compile time. */
-	if(transform_const_loop(s, loop)){
-		return loop->BeginLoop->Next;
-	}
 
-	/* Prepare the loop to be unrolled */
+	/* Prepare the loop to be emulated */
 	rc_remove_instruction(loop->Brk);
 	rc_remove_instruction(loop->EndIf);
 	rc_insert_instruction(loop->EndLoop->Prev, loop->EndIf);
 	return loop->EndLoop;
 }
 
-void rc_transform_unroll_loops(struct radeon_compiler *c,
-					struct emulate_loop_state * s)
+void rc_transform_loops(struct radeon_compiler *c,
+					struct emulate_loop_state * s,
+					int prog_inst_limit)
 {
 	struct rc_instruction * ptr;
-	
+
 	memset(s, 0, sizeof(struct emulate_loop_state));
 	s->C = c;
 	ptr = s->C->Program.Instructions.Next;
 	while(ptr != &s->C->Program.Instructions) {
 		if(ptr->Type == RC_INSTRUCTION_NORMAL &&
 					ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){
-			ptr = transform_loop(s, ptr);
+			ptr = transform_loop(s, ptr, prog_inst_limit);
 			if(!ptr){
 				return;
 			}
@@ -444,8 +492,23 @@ void rc_transform_unroll_loops(struct radeon_compiler *c,
 	}
 }
 
-void rc_emulate_loops(struct emulate_loop_state *s,
-						unsigned int max_instructions)
+void rc_unroll_loops(struct radeon_compiler *c, int prog_inst_limit)
+{
+	struct rc_instruction * inst;
+	struct loop_info loop;
+
+	for(inst = c->Program.Instructions.Next;
+			inst != &c->Program.Instructions; inst = inst->Next) {
+
+		if (inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+			if (build_loop_info(c, &loop, inst)) {
+				try_unroll_loop(c, &loop, prog_inst_limit);
+			}
+		}
+	}
+}
+
+void rc_emulate_loops(struct emulate_loop_state *s, int prog_inst_limit)
 {
 	int i;
 	/* Iterate backwards of the list of loops so that loops that nested
@@ -455,8 +518,8 @@ void rc_emulate_loops(struct emulate_loop_state *s,
 		if(!s->Loops[i].EndLoop){
 			continue;
 		}
-		unsigned int iterations = loop_calc_iterations(s, &s->Loops[i],
-							max_instructions);
-		loop_unroll(s, &s->Loops[i], iterations);
+		unsigned int iterations = loop_max_possible_iterations(
+					s->C, &s->Loops[i], prog_inst_limit);
+		unroll_loop(s->C, &s->Loops[i], iterations);
 	}
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
index 7748813..339527b 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
@@ -23,10 +23,11 @@ struct emulate_loop_state {
 	unsigned int LoopReserved;
 };
 
-void rc_transform_unroll_loops(struct radeon_compiler *c,
-					struct emulate_loop_state * s);
+void rc_transform_loops(struct radeon_compiler *c,
+			struct emulate_loop_state * s, int prog_inst_limit);
 
-void rc_emulate_loops(struct emulate_loop_state *s,
-					unsigned int max_instructions);
+void rc_unroll_loops(struct radeon_compiler * c, int prog_inst_limit);
+
+void rc_emulate_loops(struct emulate_loop_state * s, int prog_inst_limit);
 
 #endif /* RADEON_EMULATE_LOOPS_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
index eca0651..7a3f359 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
@@ -164,7 +164,8 @@ static void peephole(struct radeon_compiler * c, struct rc_instruction * inst_mo
 	    inst = inst->Next) {
 		/* XXX In the future we might be able to make the optimizer
 		 * smart enough to handle loops. */
-		if(inst->U.I.Opcode == RC_OPCODE_BGNLOOP){
+		if(inst->U.I.Opcode == RC_OPCODE_BGNLOOP
+				|| inst->U.I.Opcode == RC_OPCODE_ENDLOOP){
 			return;
 		}
 		rc_for_all_reads_mask(inst, peephole_scan_read, &s);




More information about the mesa-commit mailing list