[PATCH 2/2] r300/compiler: Implement hardware assisted loops for vertex shaders.

Tom Stellard tstellar at gmail.com
Thu Aug 5 10:19:00 PDT 2010


---
 src/gallium/drivers/r300/r300_emit.c               |   23 ++++
 src/gallium/drivers/r300/r300_reg.h                |   21 ++++
 src/gallium/drivers/r300/r300_state.c              |    4 +-
 src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c |    2 +-
 src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c |  113 ++++++++++++++++++--
 .../drivers/dri/r300/compiler/r3xx_vertprog_dump.c |   14 +++
 src/mesa/drivers/dri/r300/compiler/radeon_code.h   |   14 +++
 .../dri/r300/compiler/radeon_emulate_loops.c       |    8 +-
 .../dri/r300/compiler/radeon_emulate_loops.h       |    3 +-
 src/mesa/drivers/dri/r300/r300_reg.h               |   21 ++++
 10 files changed, 206 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index 17e180a..17981e5 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -936,6 +936,29 @@ void r300_emit_vs_state(struct r300_context* r300, unsigned size, void* state)
             OUT_CS_TABLE(data, 4);
         }
     }
+
+    /* Emit flow control instructions. */
+    if (code->num_fc_ops) {
+        OUT_CS_REG(R300_VAP_PVS_FLOW_CNTL_OPC, code->fc_ops);
+        if (r300screen->caps.is_r500) {
+            for(i = 0; i < code->num_fc_ops; i++) {
+                OUT_CS_REG(R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0 + (8 * i),
+                    code->fc_op_addrs[i].lw);
+                OUT_CS_REG(R500_VAP_PVS_FLOW_CNTL_ADDRS_UW_0 + (8 * i),
+                    code->fc_op_addrs[i].uw);
+                OUT_CS_REG(R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0 + (4 * i),
+                    code->fc_loop_index[i]);
+            }
+        } else {
+            for(i = 0; i < code->num_fc_ops; i++) {
+                OUT_CS_REG(R300_VAP_PVS_FLOW_CNTL_0 + (4 * i),
+                    code->fc_op_addrs[i].lw);
+                OUT_CS_REG(R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0 + (4 * i),
+                    code->fc_loop_index[i]);
+            }
+        }
+    }
+
     END_CS;
 }
 
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index 99a9d65..1f68f66 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -496,6 +496,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_VAP_GB_HORZ_CLIP_ADJ                   0x2228
 #define R300_VAP_GB_HORZ_DISC_ADJ                   0x222c
 
+#define R300_VAP_PVS_FLOW_CNTL_0            0x2230
+#define R300_PVS_FC_ACT_ADRS(x)             ((x) << 0)
+#define R300_PVS_FC_LOOP_CNT_JMP_INST(x)    ((x) << 8)
+#define R300_PVS_FC_LAST_INST(x)            ((x) << 16)
+#define R300_PVS_FC_RTN_INST(x)             ((x) << 24)
+
 /* gap */
 
 /* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between
@@ -514,6 +520,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_2288_R300                    0x00750000 /* -- nh */
 #       define R300_2288_RV350                   0x0000FFFF /* -- Vladimir */
 
+#define R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0 0x2290
+#define R300_PVS_FC_LOOP_INIT_VAL(x)        ((x) << 0)
+#define R300_PVS_FC_LOOP_STEP_VAL(x)        ((x) << 8)
+
 /* gap */
 
 /* Addresses are relative to the vertex program instruction area of the
@@ -548,6 +558,9 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_VAP_PVS_CODE_CNTL_1	    0x22D8
 #       define R300_PVS_LAST_VTX_SRC_INST_SHIFT  0
 #define R300_VAP_PVS_FLOW_CNTL_OPC          0x22DC
+#define R300_VAP_PVS_FC_OPC_JUMP(x)         (1u << (2 * (x))) /* 2-bit opcode field for op slot x; */
+#define R300_VAP_PVS_FC_OPC_LOOP(x)         (2u << (2 * (x))) /* unsigned so that slot 15 does not */
+#define R300_VAP_PVS_FC_OPC_JSR(x)          (3u << (2 * (x))) /* shift into the int sign bit. */
 
 /* The entire range from 0x2300 to 0x2AC inclusive seems to be used for
  * immediate vertices
@@ -564,6 +577,14 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /* write 0 to indicate end of packet? */
 #define R300_VAP_VTX_END_OF_PKT             0x24AC
 
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0   0x2500
+#define R500_PVS_FC_ACT_ADRS(x)             ((x) << 0)
+#define R500_PVS_FC_LOOP_CNT_JMP_INST(x)    ((x) << 16)
+
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_UW_0   0x2504
+#define R500_PVS_FC_LAST_INST(x)            ((x) << 0)
+#define R500_PVS_FC_RTN_INST(x)             ((x) << 16)
+
 /* gap */
 
 /* These are values from r300_reg/r300_reg.h - they are known to be correct
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index 9db5e9e..73bb2ec 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -1758,10 +1758,12 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
     r300->rs_block_state.dirty = TRUE; /* Will be updated before the emission. */
 
     if (r300->screen->caps.has_tcl) {
+        unsigned fc_op_dwords = r300->screen->caps.is_r500 ? 6 : 4;
         r300->vs_state.dirty = TRUE;
         r300->vs_state.size =
                 vs->code.length + 9 +
-                (vs->immediates_count ? vs->immediates_count * 4 + 3 : 0);
+                (vs->immediates_count ? vs->immediates_count * 4 + 3 : 0) +
+        (vs->code.num_fc_ops ? vs->code.num_fc_ops * fc_op_dwords + 2 : 0);
 
         if (vs->externals_count) {
             r300->vs_constants.dirty = TRUE;
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
index c6246a8..0709394 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
@@ -113,7 +113,7 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 		debug_program_log(c, "after unroll loops");
 	}
 	else{
-		rc_transform_loops(&c->Base, &loop_state);
+		rc_transform_loops(&c->Base, &loop_state, R300_PFS_MAX_ALU_INST);
 		debug_program_log(c, "after transform loops");
 
 		rc_emulate_branches(&c->Base);
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
index e940fed..c895242 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
@@ -32,6 +32,11 @@
 #include "radeon_emulate_branches.h"
 #include "radeon_emulate_loops.h"
 
+struct loop { /* Translation-time record for one open loop. */
+	int BgnLoop; /* compiler->code->length / 4 at the time BGNLOOP was seen */
+
+};
+
 /*
  * Take an already-setup and valid source then swizzle it appropriately to
  * obtain a constant ZERO or ONE source.
@@ -337,6 +342,10 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
 {
 	struct rc_instruction *rci;
 
+	struct loop * loops = NULL; /* allocated on first BGNLOOP; never read indeterminate */
+	int current_loop_depth = 0; /* loops opened by BGNLOOP and not yet closed */
+	int loops_reserved = 0; /* presumably the capacity of loops[], kept by the reserve macro */
+
 	compiler->code->pos_end = 0;	/* Not supported yet */
 	compiler->code->length = 0;
 
@@ -385,6 +394,68 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
 		case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
 		case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
 		case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
+		case RC_OPCODE_BGNLOOP:
+		{
+			struct loop * l;
+			/* The limit is on loops open at once, so test the live
+			 * nesting depth, not the capacity of the loops array. */
+			if ((!compiler->Base.is_r500
+				&& current_loop_depth >= R300_VS_MAX_LOOP_DEPTH)
+				|| current_loop_depth >= R500_VS_MAX_FC_DEPTH) {
+				rc_error(&compiler->Base, "Loops are nested too deep.");
+				return;
+			}
+			memory_pool_array_reserve(&compiler->Base.Pool,
+					struct loop, loops, current_loop_depth,
+					loops_reserved, 1);
+			l = &loops[current_loop_depth++];
+			memset(l , 0, sizeof(struct loop));
+			l->BgnLoop = (compiler->code->length / 4); /* next emitted inst */
+			continue;
+		}
+		case RC_OPCODE_ENDLOOP:
+		{
+			/* Close the innermost open loop and record one hardware
+			 * LOOP flow-control op that describes it. */
+			struct loop * l = &loops[current_loop_depth - 1];
+			unsigned int act_addr = l->BgnLoop - 1;
+			unsigned int last_addr = (compiler->code->length / 4) - 1;
+			unsigned int ret_addr = l->BgnLoop;
+			unsigned int fc = compiler->code->num_fc_ops;
+
+			/* fc_op_addrs[], fc_loop_index[] and the 2-bit fields of
+			 * fc_ops hold R300_VS_MAX_FC_OPS entries, so bound the
+			 * number of ops emitted so far, not the loop array. */
+			if (fc >= R300_VS_MAX_FC_OPS) {
+				rc_error(&compiler->Base,
+					"Too many flow control instructions.");
+				return;
+			}
+			if (compiler->Base.is_r500) {
+				/* r500 splits the addresses over two dwords. */
+				compiler->code->fc_op_addrs[fc].lw =
+					R500_PVS_FC_ACT_ADRS(act_addr)
+					| R500_PVS_FC_LOOP_CNT_JMP_INST(0xffffu);
+				compiler->code->fc_op_addrs[fc].uw =
+					R500_PVS_FC_LAST_INST(last_addr)
+					| R500_PVS_FC_RTN_INST(ret_addr);
+			} else {
+				/* r300 packs all four fields into one dword. */
+				compiler->code->fc_op_addrs[fc].lw =
+					R300_PVS_FC_ACT_ADRS(act_addr)
+					| R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
+					| R300_PVS_FC_LAST_INST(last_addr)
+					| R300_PVS_FC_RTN_INST(ret_addr);
+			}
+			compiler->code->fc_loop_index[fc] =
+				R300_PVS_FC_LOOP_INIT_VAL(0x0)
+				| R300_PVS_FC_LOOP_STEP_VAL(0x1);
+			compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(fc);
+			compiler->code->num_fc_ops++;
+			current_loop_depth--;
+			continue;
+		}
+
 		default:
 			rc_error(&compiler->Base, "Unknown opcode %s\n", rc_get_opcode_info(vpi->Opcode)->Name);
 			return;
@@ -406,6 +477,7 @@ struct temporary_allocation {
 static void allocate_temporary_registers(struct r300_vertex_program_compiler * compiler)
 {
 	struct rc_instruction *inst;
+	struct rc_instruction *end_loop = NULL;
 	unsigned int num_orig_temps = 0;
 	char hwtemps[R300_VS_MAX_TEMPS];
 	struct temporary_allocation * ta;
@@ -440,10 +512,35 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
 	/* Pass 2: Determine original temporary lifetimes */
 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+		/* Instructions inside of loops need to use the ENDLOOP
+		 * instruction as their LastRead. */
+		if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+			int endloops = 1;
+			struct rc_instruction * ptr;
+			for(ptr = inst->Next;
+				ptr != &compiler->Base.Program.Instructions;
+							ptr = ptr->Next){
+				if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+					endloops++;
+				} else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
+					endloops--;
+					if (endloops <= 0) {
+						end_loop = ptr;
+						break;
+					}
+				}
+			}
+		}
+
+		if (inst == end_loop) {
+			end_loop = NULL;
+			continue;
+		}
 
 		for (i = 0; i < opcode->NumSrcRegs; ++i) {
 			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY)
-				ta[inst->U.I.SrcReg[i].Index].LastRead = inst;
+				ta[inst->U.I.SrcReg[i].Index].LastRead =
+						end_loop ? end_loop : inst;
 		}
 	}
 
@@ -640,17 +737,11 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 
 	debug_program_log(compiler, "before compilation");
 
-	/* XXX Ideally this should be done only for r3xx, but since
-	 * we don't have branching support for r5xx, we use the emulation
-	 * on all chipsets. */
+	if (compiler->Base.is_r500)
+		rc_transform_loops(&compiler->Base, &loop_state, R500_VS_MAX_ALU);
+	else
+		rc_transform_loops(&compiler->Base, &loop_state, R300_VS_MAX_ALU);
 
-	if (compiler->Base.is_r500){
-		rc_transform_loops(&compiler->Base, &loop_state);
-		rc_emulate_loops(&loop_state, R500_VS_MAX_ALU);
-	} else {
-		rc_transform_loops(&compiler->Base, &loop_state);
-		rc_emulate_loops(&loop_state, R300_VS_MAX_ALU);
-	}
 	debug_program_log(compiler, "after emulate loops");
 
 	rc_emulate_branches(&compiler->Base);
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c
index 5800f1a..0fabb9c 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c
@@ -177,4 +177,18 @@ void r300_vertex_program_dump(struct r300_vertex_program_code * vs)
 			r300_vs_src_dump(vs->body.d[offset+1+src]);
 		}
 	}
+
+	fprintf(stderr, "Flow Control Ops: 0x%08x\n",vs->fc_ops);
+	for(i = 0; i < vs->num_fc_ops; i++) {
+		switch((vs->fc_ops >> (i * 2)) & 0x3 ) {
+		case 0: fprintf(stderr, "NOP"); break;
+		case 1: fprintf(stderr, "JUMP"); break;
+		case 2: fprintf(stderr, "LOOP"); break;
+		case 3: fprintf(stderr, "JSR"); break;
+		}
+
+		fprintf(stderr,": lw-> 0x%08x uw-> 0x%08x\n",
+			vs->fc_op_addrs[i].lw,
+			vs->fc_op_addrs[i].uw);
+	}
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.h b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
index e14a352..b5b8370 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_code.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
@@ -243,6 +243,12 @@ struct rX00_fragment_program_code {
 #define R500_VS_MAX_ALU	        1024
 #define R500_VS_MAX_ALU_DWORDS  (R500_VS_MAX_ALU * 4)
 #define R300_VS_MAX_TEMPS	32
+/* This is the max for all chipsets (r300-r500) */
+#define R300_VS_MAX_FC_OPS 16
+/* The r500 maximum depth is not just for loops, but any combination of loops
+ * and subroutine jumps. */
+#define R500_VS_MAX_FC_DEPTH 8
+#define R300_VS_MAX_LOOP_DEPTH 1
 
 #define VSF_MAX_INPUTS 32
 #define VSF_MAX_OUTPUTS 32
@@ -263,6 +269,14 @@ struct r300_vertex_program_code {
 
 	uint32_t InputsRead;
 	uint32_t OutputsWritten;
+
+	unsigned int num_fc_ops;
+	uint32_t fc_ops;
+	struct {
+		uint32_t lw;
+		uint32_t uw;
+	} fc_op_addrs[R300_VS_MAX_FC_OPS];
+	uint32_t fc_loop_index[R300_VS_MAX_FC_OPS];
 };
 
 void r300_vertex_program_dump(struct r300_vertex_program_code * vs);
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
index 2a3306f..32d4b45 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
@@ -423,7 +423,8 @@ static int build_loop_info(struct radeon_compiler * c, struct loop_info * loop,
  * @param inst A pointer to a BGNLOOP instruction.
  * @return 1 for success, 0 for failure
  */
-int transform_loop(struct emulate_loop_state * s, struct rc_instruction * inst)
+static int transform_loop(struct emulate_loop_state * s,
+						struct rc_instruction * inst)
 {
 	struct loop_info * loop;
 
@@ -435,7 +436,7 @@ int transform_loop(struct emulate_loop_state * s, struct rc_instruction * inst)
 	if (!build_loop_info(s->C, loop, inst))
 		return 0;
 
-	if(try_unroll_loop(s->C, loop, -1)){
+	if(try_unroll_loop(s->C, loop, s->prog_inst_limit)){
 		return 1;
 	}
 
@@ -472,12 +473,13 @@ int transform_loop(struct emulate_loop_state * s, struct rc_instruction * inst)
 }
 
 void rc_transform_loops(struct radeon_compiler *c,
-						struct emulate_loop_state * s)
+			struct emulate_loop_state * s, int prog_inst_limit)
 {
 	struct rc_instruction * ptr;
 
 	memset(s, 0, sizeof(struct emulate_loop_state));
 	s->C = c;
+	s->prog_inst_limit = prog_inst_limit;
 	for(ptr = s->C->Program.Instructions.Next;
 			ptr != &s->C->Program.Instructions; ptr = ptr->Next) {
 		if(ptr->Type == RC_INSTRUCTION_NORMAL &&
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
index 86d91ef..bba1f68 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
@@ -21,10 +21,11 @@ struct emulate_loop_state {
 	struct loop_info * Loops;
 	unsigned int LoopCount;
 	unsigned int LoopReserved;
+	int prog_inst_limit;
 };
 
 void rc_transform_loops(struct radeon_compiler *c,
-						struct emulate_loop_state * s);
+			struct emulate_loop_state * s, int prog_inst_limit);
 
 void rc_unroll_loops(struct radeon_compiler * c, int prog_inst_limit);
 
diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h
index f25264b..a967b67 100644
--- a/src/mesa/drivers/dri/r300/r300_reg.h
+++ b/src/mesa/drivers/dri/r300/r300_reg.h
@@ -441,6 +441,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_VAP_GB_HORZ_CLIP_ADJ                   0x2228
 #define R300_VAP_GB_HORZ_DISC_ADJ                   0x222c
 
+#define R300_VAP_PVS_FLOW_CNTL_0            0x2230
+#define R300_PVS_FC_ACT_ADRS(x)             ((x) << 0)
+#define R300_PVS_FC_LOOP_CNT_JMP_INST(x)    ((x) << 8)
+#define R300_PVS_FC_LAST_INST(x)            ((x) << 16)
+#define R300_PVS_FC_RTN_INST(x)             ((x) << 24)
+
 /* gap */
 
 /* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between
@@ -459,6 +465,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_2288_R300                    0x00750000 /* -- nh */
 #       define R300_2288_RV350                   0x0000FFFF /* -- Vladimir */
 
+#define R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0 0x2290
+#define R300_PVS_FC_LOOP_INIT_VAL(x)        ((x) << 0)
+#define R300_PVS_FC_LOOP_STEP_VAL(x)        ((x) << 8)
+
 /* gap */
 
 /* Addresses are relative to the vertex program instruction area of the
@@ -489,6 +499,9 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_VAP_PVS_CODE_CNTL_1	    0x22D8
 #       define R300_PVS_LAST_VTX_SRC_INST_SHIFT  0
 #define R300_VAP_PVS_FLOW_CNTL_OPC          0x22DC
+#define R300_VAP_PVS_FC_OPC_JUMP(x)         (1u << (2 * (x))) /* 2-bit opcode field for op slot x; */
+#define R300_VAP_PVS_FC_OPC_LOOP(x)         (2u << (2 * (x))) /* unsigned so that slot 15 does not */
+#define R300_VAP_PVS_FC_OPC_JSR(x)          (3u << (2 * (x))) /* shift into the int sign bit. */
 
 /* The entire range from 0x2300 to 0x2AC inclusive seems to be used for
  * immediate vertices
@@ -505,6 +518,14 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /* write 0 to indicate end of packet? */
 #define R300_VAP_VTX_END_OF_PKT             0x24AC
 
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0   0x2500
+#define R500_PVS_FC_ACT_ADRS(x)             ((x) << 0)
+#define R500_PVS_FC_LOOP_CNT_JMP_INST(x)    ((x) << 16)
+
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_UW_0   0x2504
+#define R500_PVS_FC_LAST_INST(x)            ((x) << 0)
+#define R500_PVS_FC_RTN_INST(x)             ((x) << 16)
+
 /* gap */
 
 /* These are values from r300_reg/r300_reg.h - they are known to be correct
-- 
1.7.1


--EVF5PPMfhYS0aIcm--


More information about the mesa-dev mailing list