Mesa (master): r300/compiler: Use hardware flow control instructions for loops on r500.

Marek Olšák mareko at kemper.freedesktop.org
Sat Jul 3 02:35:21 UTC 2010


Module: Mesa
Branch: master
Commit: f381c52081b2cbff31c2f38abf16dffcc08f681c
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=f381c52081b2cbff31c2f38abf16dffcc08f681c

Author: Tom Stellard <tstellar at gmail.com>
Date:   Fri Jun 18 21:20:57 2010 -0700

r300/compiler: Use hardware flow control instructions for loops on r500.

---

 src/gallium/drivers/r300/r300_fs.c                 |    3 +-
 src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c |   25 ++---
 src/mesa/drivers/dri/r300/compiler/r500_fragprog.c |   54 +++++++++--
 src/mesa/drivers/dri/r300/compiler/r500_fragprog.h |    4 +
 .../drivers/dri/r300/compiler/r500_fragprog_emit.c |  100 ++++++++++++++++----
 .../dri/r300/compiler/radeon_dataflow_deadcode.c   |    4 +
 .../drivers/dri/r300/compiler/radeon_opcodes.c     |    6 +
 .../drivers/dri/r300/compiler/radeon_opcodes.h     |    2 +
 8 files changed, 154 insertions(+), 44 deletions(-)

diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index 424f831..b145ded 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -246,13 +246,14 @@ static void r300_emit_fs_code_to_buffer(
     if (r300->screen->caps.is_r500) {
         struct r500_fragment_program_code *code = &generic_code->code.r500;
 
-        shader->cb_code_size = 17 +
+        shader->cb_code_size = 19 +
                                ((code->inst_end + 1) * 6) +
                                imm_count * 7;
 
         NEW_CB(shader->cb_code, shader->cb_code_size);
         OUT_CB_REG(R500_US_CONFIG, R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
         OUT_CB_REG(R500_US_PIXSIZE, code->max_temp_idx);
+        OUT_CB_REG(R500_US_FC_CTRL, code->us_fc_ctrl);
         OUT_CB_REG(R500_US_CODE_RANGE,
                    R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(code->inst_end));
         OUT_CB_REG(R500_US_CODE_OFFSET, 0);
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
index 147b071..b53571a 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
@@ -103,15 +103,14 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 
 	debug_program_log(c, "before compilation");
 
-	/* XXX Ideally this should be done only for r3xx, but since
-	 * we don't have branching support for r5xx, we use the emulation
-	 * on all chipsets. */
-
-	rc_transform_unroll_loops(&c->Base, &loop_state);
-	
-	debug_program_log(c, "after transform loops");
-	
-	if (!c->Base.is_r500){
+	if (c->Base.is_r500){
+		r500_transform_unroll_loops(&c->Base, &loop_state);	
+		debug_program_log(c, "after r500 transform loops");
+	}
+	else{
+		rc_transform_unroll_loops(&c->Base, &loop_state);
+		debug_program_log(c, "after transform loops");
+		
 		rc_emulate_branches(&c->Base);
 		debug_program_log(c, "after emulate branches");
 	}
@@ -161,14 +160,10 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 
 	debug_program_log(c, "after deadcode");
 
-	if(c->Base.is_r500){
-		rc_emulate_loops(&loop_state, R500_PFS_MAX_INST);
-	}
-	else{
+	if(!c->Base.is_r500){
 		rc_emulate_loops(&loop_state, R300_PFS_MAX_ALU_INST);
+		debug_program_log(c, "after emulate loops");
 	}
-	
-	debug_program_log(c, "after emulate looops");
 
 	rc_optimize(&c->Base);
 
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
index 350ce3a..e6b5522 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
@@ -30,6 +30,7 @@
 #include <stdio.h>
 
 #include "../r300_reg.h"
+#include "radeon_emulate_loops.h"
 
 /**
  * Rewrite IF instructions to use the ALU result special register.
@@ -59,6 +60,31 @@ int r500_transform_IF(
 	return 1;
 }
 
+/**
+ * Rewrite loops to make them easier to emit.  This is not a local
+ * transformation, because it modifies and reorders an entire block of code.
+ */
+void r500_transform_unroll_loops(struct radeon_compiler * c,
+						struct emulate_loop_state *s)
+{
+	int i;
+	
+	rc_transform_unroll_loops(c, s);
+	
+	for( i = s->LoopCount - 1; i >= 0; i-- ){
+		struct rc_instruction * inst_continue;
+		if(!s->Loops[i].EndLoop){
+			continue;
+		}
+		/* Insert a continue instruction at the end of the loop.  This
+		 * is required in order to emit loops correctly. */
+		inst_continue = rc_insert_new_instruction(c,
+						s->Loops[i].EndIf->Prev);
+		inst_continue->U.I.Opcode = RC_OPCODE_CONTINUE;
+	}
+
+}
+
 static int r500_swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
 {
 	unsigned int relevant;
@@ -322,6 +348,11 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c)
     case R500_INST_TYPE_FC:
       fprintf(stderr, "\t2:FC_INST    0x%08x:", code->inst[n].inst2);
       inst = code->inst[n].inst2;
+      /* JUMP_FUNC JUMP_ANY*/
+      fprintf(stderr, "0x%02x %1x ", inst >> 8 & 0xff,
+          (inst & R500_FC_JUMP_ANY) >> 5);
+      
+      /* OP */
       switch(inst & 0x7){
       case R500_FC_OP_JUMP:
       	fprintf(stderr, "JUMP");
@@ -348,9 +379,8 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c)
         fprintf(stderr, "CONTINUE");
         break;
       }
-      fprintf(stderr, " B_ELSE: %1x, JUMP_ANY: %1x", (inst & R500_FC_B_ELSE) >> 4,
-                                                     (inst & R500_FC_JUMP_ANY) >> 5);
-      fprintf(stderr, ", A_OP: ");
+      fprintf(stderr," "); 
+      /* A_OP */
       switch(inst & (0x3 << 6)){
       case R500_FC_A_OP_NONE:
         fprintf(stderr, "NONE");
@@ -362,11 +392,9 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c)
         fprintf(stderr, "PUSH");
         break;
       }
-      fprintf(stderr, "\n\tJUMP_FUNC    0x%02x, B_POP_CNT: %d",
-                                                        (inst >> 8) & 0xff,
-                                                        (inst >> 16) & 0x1f);
+      /* B_OP0 B_OP1 */
       for(i=0; i<2; i++){
-        fprintf(stderr, ", B_OP%d: ", i);
+        fprintf(stderr, " ");
         switch(inst & (0x3 << (24 + (i * 2)))){
         /* R500_FC_B_OP0_NONE 
 	 * R500_FC_B_OP1_NONE */
@@ -383,9 +411,17 @@ void r500FragmentProgramDump(struct rX00_fragment_program_code *c)
           break;
         }
       }
-      fprintf(stderr, ", IGN_UNC: %1x\n", inst & R500_FC_IGNORE_UNCOVERED);
+      /*POP_CNT B_ELSE */
+      fprintf(stderr, " %d %1x", (inst >> 16) & 0x1f, (inst & R500_FC_B_ELSE) >> 4);
+      inst = code->inst[n].inst3;
+      /* JUMP_ADDR */
+      fprintf(stderr, " %d", inst >> 16);
+      
+      if(code->inst[n].inst2 & R500_FC_IGNORE_UNCOVERED){
+        fprintf(stderr, " IGN_UNC");
+      }
       inst = code->inst[n].inst3;
-      fprintf(stderr, "\t3:FC_ADDR    0x%08x:", inst);
+      fprintf(stderr, "\n\t3:FC_ADDR    0x%08x:", inst);
       fprintf(stderr, "BOOL: 0x%02x, INT: 0x%02x, JUMP_ADDR: %d, JMP_GLBL: %1x\n",
       inst & 0x1f, (inst >> 8) & 0x1f, (inst >> 16) & 0x1ff, inst >> 31); 
       break;
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
index 4efbae7..0d005a7 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
@@ -36,6 +36,8 @@
 #include "radeon_compiler.h"
 #include "radeon_swizzle.h"
 
+struct emulate_loop_state;
+
 extern void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler);
 
 extern void r500FragmentProgramDump(struct rX00_fragment_program_code *c);
@@ -47,4 +49,6 @@ extern int r500_transform_IF(
 	struct rc_instruction * inst,
 	void* data);
 
+void r500_transform_unroll_loops(struct radeon_compiler * c,
+						struct emulate_loop_state * s);
 #endif
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
index fb2d8b5..0bd8f0a 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
@@ -45,6 +45,8 @@
 
 #include "radeon_program_pair.h"
 
+#define MAX_BRANCH_DEPTH_FULL 32
+#define MAX_BRANCH_DEPTH_PARTIAL 4
 
 #define PROG_CODE \
 	struct r500_fragment_program_code *code = &c->code->code.r500
@@ -61,6 +63,10 @@ struct branch_info {
 	int Endif;
 };
 
+struct loop_info {
+	int LoopStart;
+};
+
 struct emit_state {
 	struct radeon_compiler * C;
 	struct r500_fragment_program_code * Code;
@@ -69,7 +75,12 @@ struct emit_state {
 	unsigned int CurrentBranchDepth;
 	unsigned int BranchesReserved;
 
+	struct loop_info * Loops;
+	unsigned int CurrentLoopDepth;
+	unsigned int LoopsReserved;
+
 	unsigned int MaxBranchDepth;
+
 };
 
 static unsigned int translate_rgb_op(struct r300_fragment_program_compiler *c, rc_opcode opcode)
@@ -359,16 +370,49 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 
 	s->Code->inst[newip].inst0 = R500_INST_TYPE_FC | R500_INST_ALU_WAIT;
 
-	if (inst->U.I.Opcode == RC_OPCODE_IF) {
-		if (s->CurrentBranchDepth >= 32) {
+	switch(inst->U.I.Opcode){
+	struct branch_info * branch;
+	struct loop_info * loop;
+	case RC_OPCODE_BGNLOOP:
+		memory_pool_array_reserve(&s->C->Pool, struct loop_info,
+			s->Loops, s->CurrentLoopDepth, s->LoopsReserved, 1);
+
+		loop = &s->Loops[s->CurrentLoopDepth++];
+		
+		/* We don't emit an instruction for BGNLOOP, so we need to
+		 * decrement the instruction counter, but first we need to
+		 * set LoopStart to the current value of inst_end, which
+		 * will end up being the first real instruction in the loop.*/
+		loop->LoopStart = s->Code->inst_end--;
+		break;
+	
+	case RC_OPCODE_BRK:
+		/* Don't emit an instruction for BRK */
+		s->Code->inst_end--;
+		break;
+
+	case RC_OPCODE_CONTINUE:
+		loop = &s->Loops[s->CurrentLoopDepth - 1];
+		s->Code->inst[newip].inst2 = R500_FC_OP_JUMP |
+			R500_FC_JUMP_FUNC(0xff);
+		s->Code->inst[newip].inst3 = R500_FC_JUMP_ADDR(loop->LoopStart);
+		break;
+
+	case RC_OPCODE_ENDLOOP:
+		/* Don't emit an instruction for ENDLOOP */
+		s->Code->inst_end--;
+		s->CurrentLoopDepth--;
+		break;
+
+	case RC_OPCODE_IF:
+		if ( s->CurrentBranchDepth >= MAX_BRANCH_DEPTH_FULL) {
 			rc_error(s->C, "Branch depth exceeds hardware limit");
 			return;
 		}
-
 		memory_pool_array_reserve(&s->C->Pool, struct branch_info,
 				s->Branches, s->CurrentBranchDepth, s->BranchesReserved, 1);
 
-		struct branch_info * branch = &s->Branches[s->CurrentBranchDepth++];
+		branch = &s->Branches[s->CurrentBranchDepth++];
 		branch->If = newip;
 		branch->Else = -1;
 		branch->Endif = -1;
@@ -377,29 +421,50 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 			s->MaxBranchDepth = s->CurrentBranchDepth;
 
 		/* actual instruction is filled in at ENDIF time */
-	} else if (inst->U.I.Opcode == RC_OPCODE_ELSE) {
+		break;
+	
+	case RC_OPCODE_ELSE:
 		if (!s->CurrentBranchDepth) {
 			rc_error(s->C, "%s: got ELSE outside a branch", __FUNCTION__);
 			return;
 		}
 
-		struct branch_info * branch = &s->Branches[s->CurrentBranchDepth - 1];
+		branch = &s->Branches[s->CurrentBranchDepth - 1];
 		branch->Else = newip;
 
 		/* actual instruction is filled in at ENDIF time */
-	} else if (inst->U.I.Opcode == RC_OPCODE_ENDIF) {
+		break;
+
+	case RC_OPCODE_ENDIF:
 		if (!s->CurrentBranchDepth) {
 			rc_error(s->C, "%s: got ELSE outside a branch", __FUNCTION__);
 			return;
 		}
 
-		struct branch_info * branch = &s->Branches[s->CurrentBranchDepth - 1];
-		branch->Endif = newip;
-
+		branch = &s->Branches[s->CurrentBranchDepth - 1];
+		
+		if(inst->Prev->U.I.Opcode == RC_OPCODE_BRK){
+			branch->Endif = --s->Code->inst_end;
+			s->Code->inst[branch->Endif].inst2 |=
+				R500_FC_B_OP0_DECR;
+		}
+		else{
+			branch->Endif = newip;
+		
+			s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP
+				| R500_FC_A_OP_NONE /* no address stack */
+				| R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */
+				| R500_FC_B_OP0_DECR /* decrement branch counter if stay */
+				| R500_FC_B_OP1_NONE /* no branch counter if stay */
+				| R500_FC_B_POP_CNT(1)
+			;
+			s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
+		}
 		s->Code->inst[branch->If].inst2 = R500_FC_OP_JUMP
 			| R500_FC_A_OP_NONE /* no address stack */
 			| R500_FC_JUMP_FUNC(0x0f) /* jump if ALU result is false */
 			| R500_FC_B_OP0_INCR /* increment branch counter if stay */
+			| R500_FC_IGNORE_UNCOVERED
 		;
 
 		if (branch->Else >= 0) {
@@ -421,17 +486,10 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 			s->Code->inst[branch->If].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
 		}
 
-		s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP
-			| R500_FC_A_OP_NONE /* no address stack */
-			| R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */
-			| R500_FC_B_OP0_DECR /* decrement branch counter if stay */
-			| R500_FC_B_OP1_NONE /* no branch counter if stay */
-			| R500_FC_B_POP_CNT(1)
-		;
-		s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
 
 		s->CurrentBranchDepth--;
-	} else {
+		break;
+	default:
 		rc_error(s->C, "%s: unknown opcode %s\n", __FUNCTION__, rc_get_opcode_info(inst->U.I.Opcode)->Name);
 	}
 }
@@ -486,6 +544,10 @@ void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compi
 		code->inst[ip].inst0 = R500_INST_TYPE_OUT | R500_INST_TEX_SEM_WAIT;
 	}
 
+	/* Use FULL flow control mode if branches are nested deep enough.
+	 * We do not need to enable FULL flow control mode for loops, because
+	 * we aren't using the hardware loop instructions.
+	 */
 	if (s.MaxBranchDepth >= 4) {
 		if (code->max_temp_idx < 1)
 			code->max_temp_idx = 1;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
index f8bced2..fbb4235 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
@@ -235,6 +235,10 @@ void rc_dataflow_deadcode(struct radeon_compiler * c, rc_dataflow_mark_outputs_f
 			}
 			break;
 		}
+		case RC_OPCODE_CONTINUE:
+		case RC_OPCODE_BRK:
+		case RC_OPCODE_BGNLOOP:
+			break;
 		case RC_OPCODE_ENDIF:
 			push_branch(&s);
 			break;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
index 1dc1685..128745a 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
@@ -386,6 +386,12 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
 		.NumSrcRegs = 0,
 	},
 	{
+		.Opcode = RC_OPCODE_CONTINUE,
+		.Name = "CONTINUE",
+		.IsFlowControl = 1,
+		.NumSrcRegs = 0
+	},
+	{
 		.Opcode = RC_OPCODE_REPL_ALPHA,
 		.Name = "REPL_ALPHA",
 		.HasDstReg = 1
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
index 91c82ac..e103ce5 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
@@ -187,6 +187,8 @@ typedef enum {
 
 	RC_OPCODE_ENDLOOP,
 
+	RC_OPCODE_CONTINUE,
+
 	/** special instruction, used in R300-R500 fragment program pair instructions
 	 * indicates that the result of the alpha operation shall be replicated
 	 * across all other channels */




More information about the mesa-commit mailing list