[PATCH 2/2] r300/compiler: Implement hardware assisted loops for vertex shaders.
Tom Stellard
tstellar at gmail.com
Thu Aug 5 10:19:00 PDT 2010
---
src/gallium/drivers/r300/r300_emit.c | 23 ++++
src/gallium/drivers/r300/r300_reg.h | 21 ++++
src/gallium/drivers/r300/r300_state.c | 4 +-
src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c | 2 +-
src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c | 113 ++++++++++++++++++--
.../drivers/dri/r300/compiler/r3xx_vertprog_dump.c | 14 +++
src/mesa/drivers/dri/r300/compiler/radeon_code.h | 14 +++
.../dri/r300/compiler/radeon_emulate_loops.c | 8 +-
.../dri/r300/compiler/radeon_emulate_loops.h | 3 +-
src/mesa/drivers/dri/r300/r300_reg.h | 21 ++++
10 files changed, 206 insertions(+), 17 deletions(-)
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index 17e180a..17981e5 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -936,6 +936,29 @@ void r300_emit_vs_state(struct r300_context* r300, unsigned size, void* state)
OUT_CS_TABLE(data, 4);
}
}
+
+ /* Emit flow control instructions. */
+ if (code->num_fc_ops) {
+ OUT_CS_REG(R300_VAP_PVS_FLOW_CNTL_OPC, code->fc_ops);
+ if (r300screen->caps.is_r500) {
+ for(i = 0; i < code->num_fc_ops; i++) {
+ OUT_CS_REG(R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0 + (8 * i),
+ code->fc_op_addrs[i].lw);
+ OUT_CS_REG(R500_VAP_PVS_FLOW_CNTL_ADDRS_UW_0 + (8 * i),
+ code->fc_op_addrs[i].uw);
+ OUT_CS_REG(R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0 + (4 * i),
+ code->fc_loop_index[i]);
+ }
+ } else {
+ for(i = 0; i < code->num_fc_ops; i++) {
+ OUT_CS_REG(R300_VAP_PVS_FLOW_CNTL_0 + (4 * i),
+ code->fc_op_addrs[i].lw);
+ OUT_CS_REG(R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0 + (4 * i),
+ code->fc_loop_index[i]);
+ }
+ }
+ }
+
END_CS;
}
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index 99a9d65..1f68f66 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -496,6 +496,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
#define R300_VAP_GB_HORZ_CLIP_ADJ 0x2228
#define R300_VAP_GB_HORZ_DISC_ADJ 0x222c
+#define R300_VAP_PVS_FLOW_CNTL_0 0x2230
+#define R300_PVS_FC_ACT_ADRS(x) ((x) << 0)
+#define R300_PVS_FC_LOOP_CNT_JMP_INST(x) ((x) << 8)
+#define R300_PVS_FC_LAST_INST(x) ((x) << 16)
+#define R300_PVS_FC_RTN_INST(x) ((x) << 24)
+
/* gap */
/* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between
@@ -514,6 +520,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_2288_R300 0x00750000 /* -- nh */
# define R300_2288_RV350 0x0000FFFF /* -- Vladimir */
+#define R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0 0x2290
+#define R300_PVS_FC_LOOP_INIT_VAL(x) ((x) << 0)
+#define R300_PVS_FC_LOOP_STEP_VAL(x) ((x) << 8)
+
/* gap */
/* Addresses are relative to the vertex program instruction area of the
@@ -548,6 +558,9 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
#define R300_VAP_PVS_CODE_CNTL_1 0x22D8
# define R300_PVS_LAST_VTX_SRC_INST_SHIFT 0
#define R300_VAP_PVS_FLOW_CNTL_OPC 0x22DC
+#define R300_VAP_PVS_FC_OPC_JUMP(x) (1 << (2 * (x)))
+#define R300_VAP_PVS_FC_OPC_LOOP(x) (2 << (2 * (x)))
+#define R300_VAP_PVS_FC_OPC_JSR(x) (3 << (2 * (x)))
/* The entire range from 0x2300 to 0x2AC inclusive seems to be used for
* immediate vertices
@@ -564,6 +577,14 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
/* write 0 to indicate end of packet? */
#define R300_VAP_VTX_END_OF_PKT 0x24AC
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0 0x2500
+#define R500_PVS_FC_ACT_ADRS(x) ((x) << 0)
+#define R500_PVS_FC_LOOP_CNT_JMP_INST(x) ((x) << 16)
+
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_UW_0 0x2504
+#define R500_PVS_FC_LAST_INST(x) ((x) << 0)
+#define R500_PVS_FC_RTN_INST(x) ((x) << 16)
+
/* gap */
/* These are values from r300_reg/r300_reg.h - they are known to be correct
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index 9db5e9e..73bb2ec 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -1758,10 +1758,12 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
r300->rs_block_state.dirty = TRUE; /* Will be updated before the emission. */
if (r300->screen->caps.has_tcl) {
+ unsigned fc_op_dwords = r300->screen->caps.is_r500 ? 6 : 4;
r300->vs_state.dirty = TRUE;
r300->vs_state.size =
vs->code.length + 9 +
- (vs->immediates_count ? vs->immediates_count * 4 + 3 : 0);
+ (vs->immediates_count ? vs->immediates_count * 4 + 3 : 0) +
+ (vs->code.num_fc_ops ? vs->code.num_fc_ops * fc_op_dwords + 2 : 0);
if (vs->externals_count) {
r300->vs_constants.dirty = TRUE;
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
index c6246a8..0709394 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
@@ -113,7 +113,7 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
debug_program_log(c, "after unroll loops");
}
else{
- rc_transform_loops(&c->Base, &loop_state);
+ rc_transform_loops(&c->Base, &loop_state, R300_PFS_MAX_ALU_INST);
debug_program_log(c, "after transform loops");
rc_emulate_branches(&c->Base);
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
index e940fed..c895242 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
@@ -32,6 +32,11 @@
#include "radeon_emulate_branches.h"
#include "radeon_emulate_loops.h"
+struct loop {
+ int BgnLoop;
+
+};
+
/*
* Take an already-setup and valid source then swizzle it appropriately to
* obtain a constant ZERO or ONE source.
@@ -337,6 +342,10 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
{
struct rc_instruction *rci;
+ struct loop * loops;
+ int current_loop_depth = 0;
+ int loops_reserved = 0;
+
compiler->code->pos_end = 0; /* Not supported yet */
compiler->code->length = 0;
@@ -385,6 +394,68 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
+ case RC_OPCODE_BGNLOOP:
+ {
+ struct loop * l;
+
+ if ((!compiler->Base.is_r500
+ && loops_reserved >= R300_VS_MAX_LOOP_DEPTH)
+ || loops_reserved >= R500_VS_MAX_FC_DEPTH) {
+ rc_error(&compiler->Base,
+ "Loops are nested too deep.");
+ return;
+ }
+ memory_pool_array_reserve(&compiler->Base.Pool,
+ struct loop, loops, current_loop_depth,
+ loops_reserved, 1);
+ l = &loops[current_loop_depth++];
+ memset(l , 0, sizeof(struct loop));
+ l->BgnLoop = (compiler->code->length / 4);
+ continue;
+ }
+ case RC_OPCODE_ENDLOOP:
+ {
+ struct loop * l = &loops[current_loop_depth - 1];
+ unsigned int act_addr = l->BgnLoop - 1;
+ unsigned int last_addr = (compiler->code->length / 4) - 1;
+ unsigned int ret_addr = l->BgnLoop;
+
+ if (loops_reserved >= R300_VS_MAX_FC_OPS) {
+ rc_error(&compiler->Base,
+ "Too many flow control instructions.");
+ return;
+ }
+ if (compiler->Base.is_r500) {
+ compiler->code->fc_op_addrs
+ [compiler->code->num_fc_ops].lw =
+ R500_PVS_FC_ACT_ADRS(act_addr)
+ | R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff)
+ ;
+ compiler->code->fc_op_addrs
+ [compiler->code->num_fc_ops].uw =
+ R500_PVS_FC_LAST_INST(last_addr)
+ | R500_PVS_FC_RTN_INST(ret_addr)
+ ;
+ } else {
+ compiler->code->fc_op_addrs
+ [compiler->code->num_fc_ops].lw =
+ R300_PVS_FC_ACT_ADRS(act_addr)
+ | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
+ | R300_PVS_FC_LAST_INST(last_addr)
+ | R300_PVS_FC_RTN_INST(ret_addr)
+ ;
+ }
+ compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
+ R300_PVS_FC_LOOP_INIT_VAL(0x0)
+ | R300_PVS_FC_LOOP_STEP_VAL(0x1)
+ ;
+ compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
+ compiler->code->num_fc_ops);
+ compiler->code->num_fc_ops++;
+ current_loop_depth--;
+ continue;
+ }
+
default:
rc_error(&compiler->Base, "Unknown opcode %s\n", rc_get_opcode_info(vpi->Opcode)->Name);
return;
@@ -406,6 +477,7 @@ struct temporary_allocation {
static void allocate_temporary_registers(struct r300_vertex_program_compiler * compiler)
{
struct rc_instruction *inst;
+ struct rc_instruction *end_loop = NULL;
unsigned int num_orig_temps = 0;
char hwtemps[R300_VS_MAX_TEMPS];
struct temporary_allocation * ta;
@@ -440,10 +512,35 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
/* Pass 2: Determine original temporary lifetimes */
for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+ /* Instructions inside of loops need to use the ENDLOOP
+ * instruction as their LastRead. */
+ if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+ int endloops = 1;
+ struct rc_instruction * ptr;
+ for(ptr = inst->Next;
+ ptr != &compiler->Base.Program.Instructions;
+ ptr = ptr->Next){
+ if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+ endloops++;
+ } else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
+ endloops--;
+ if (endloops <= 0) {
+ end_loop = ptr;
+ break;
+ }
+ }
+ }
+ }
+
+ if (inst == end_loop) {
+ end_loop = NULL;
+ continue;
+ }
for (i = 0; i < opcode->NumSrcRegs; ++i) {
if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY)
- ta[inst->U.I.SrcReg[i].Index].LastRead = inst;
+ ta[inst->U.I.SrcReg[i].Index].LastRead =
+ end_loop ? end_loop : inst;
}
}
@@ -640,17 +737,11 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
debug_program_log(compiler, "before compilation");
- /* XXX Ideally this should be done only for r3xx, but since
- * we don't have branching support for r5xx, we use the emulation
- * on all chipsets. */
+ if (compiler->Base.is_r500)
+ rc_transform_loops(&compiler->Base, &loop_state, R500_VS_MAX_ALU);
+ else
+ rc_transform_loops(&compiler->Base, &loop_state, R300_VS_MAX_ALU);
- if (compiler->Base.is_r500){
- rc_transform_loops(&compiler->Base, &loop_state);
- rc_emulate_loops(&loop_state, R500_VS_MAX_ALU);
- } else {
- rc_transform_loops(&compiler->Base, &loop_state);
- rc_emulate_loops(&loop_state, R300_VS_MAX_ALU);
- }
debug_program_log(compiler, "after emulate loops");
rc_emulate_branches(&compiler->Base);
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c
index 5800f1a..0fabb9c 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c
@@ -177,4 +177,18 @@ void r300_vertex_program_dump(struct r300_vertex_program_code * vs)
r300_vs_src_dump(vs->body.d[offset+1+src]);
}
}
+
+ fprintf(stderr, "Flow Control Ops: 0x%08x\n",vs->fc_ops);
+ for(i = 0; i < vs->num_fc_ops; i++) {
+ switch((vs->fc_ops >> (i * 2)) & 0x3 ) {
+ case 0: fprintf(stderr, "NOP"); break;
+ case 1: fprintf(stderr, "JUMP"); break;
+ case 2: fprintf(stderr, "LOOP"); break;
+ case 3: fprintf(stderr, "JSR"); break;
+ }
+
+ fprintf(stderr,": lw-> 0x%08x uw-> 0x%08x\n",
+ vs->fc_op_addrs[i].lw,
+ vs->fc_op_addrs[i].uw);
+ }
}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.h b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
index e14a352..b5b8370 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_code.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
@@ -243,6 +243,12 @@ struct rX00_fragment_program_code {
#define R500_VS_MAX_ALU 1024
#define R500_VS_MAX_ALU_DWORDS (R500_VS_MAX_ALU * 4)
#define R300_VS_MAX_TEMPS 32
+/* This is the max for all chipsets (r300-r500) */
+#define R300_VS_MAX_FC_OPS 16
+/* The r500 maximum depth is not just for loops, but any combination of loops
+ * and subroutine jumps. */
+#define R500_VS_MAX_FC_DEPTH 8
+#define R300_VS_MAX_LOOP_DEPTH 1
#define VSF_MAX_INPUTS 32
#define VSF_MAX_OUTPUTS 32
@@ -263,6 +269,14 @@ struct r300_vertex_program_code {
uint32_t InputsRead;
uint32_t OutputsWritten;
+
+ unsigned int num_fc_ops;
+ uint32_t fc_ops;
+ struct {
+ uint32_t lw;
+ uint32_t uw;
+ } fc_op_addrs[R300_VS_MAX_FC_OPS];
+ uint32_t fc_loop_index[R300_VS_MAX_FC_OPS];
};
void r300_vertex_program_dump(struct r300_vertex_program_code * vs);
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
index 2a3306f..32d4b45 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
@@ -423,7 +423,8 @@ static int build_loop_info(struct radeon_compiler * c, struct loop_info * loop,
* @param inst A pointer to a BGNLOOP instruction.
* @return 1 for success, 0 for failure
*/
-int transform_loop(struct emulate_loop_state * s, struct rc_instruction * inst)
+static int transform_loop(struct emulate_loop_state * s,
+ struct rc_instruction * inst)
{
struct loop_info * loop;
@@ -435,7 +436,7 @@ int transform_loop(struct emulate_loop_state * s, struct rc_instruction * inst)
if (!build_loop_info(s->C, loop, inst))
return 0;
- if(try_unroll_loop(s->C, loop, -1)){
+ if(try_unroll_loop(s->C, loop, s->prog_inst_limit)){
return 1;
}
@@ -472,12 +473,13 @@ int transform_loop(struct emulate_loop_state * s, struct rc_instruction * inst)
}
void rc_transform_loops(struct radeon_compiler *c,
- struct emulate_loop_state * s)
+ struct emulate_loop_state * s, int prog_inst_limit)
{
struct rc_instruction * ptr;
memset(s, 0, sizeof(struct emulate_loop_state));
s->C = c;
+ s->prog_inst_limit = prog_inst_limit;
for(ptr = s->C->Program.Instructions.Next;
ptr != &s->C->Program.Instructions; ptr = ptr->Next) {
if(ptr->Type == RC_INSTRUCTION_NORMAL &&
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
index 86d91ef..bba1f68 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
@@ -21,10 +21,11 @@ struct emulate_loop_state {
struct loop_info * Loops;
unsigned int LoopCount;
unsigned int LoopReserved;
+ int prog_inst_limit;
};
void rc_transform_loops(struct radeon_compiler *c,
- struct emulate_loop_state * s);
+ struct emulate_loop_state * s, int prog_inst_limit);
void rc_unroll_loops(struct radeon_compiler * c, int prog_inst_limit);
diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h
index f25264b..a967b67 100644
--- a/src/mesa/drivers/dri/r300/r300_reg.h
+++ b/src/mesa/drivers/dri/r300/r300_reg.h
@@ -441,6 +441,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
#define R300_VAP_GB_HORZ_CLIP_ADJ 0x2228
#define R300_VAP_GB_HORZ_DISC_ADJ 0x222c
+#define R300_VAP_PVS_FLOW_CNTL_0 0x2230
+#define R300_PVS_FC_ACT_ADRS(x) ((x) << 0)
+#define R300_PVS_FC_LOOP_CNT_JMP_INST(x) ((x) << 8)
+#define R300_PVS_FC_LAST_INST(x) ((x) << 16)
+#define R300_PVS_FC_RTN_INST(x) ((x) << 24)
+
/* gap */
/* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between
@@ -459,6 +465,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_2288_R300 0x00750000 /* -- nh */
# define R300_2288_RV350 0x0000FFFF /* -- Vladimir */
+#define R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0 0x2290
+#define R300_PVS_FC_LOOP_INIT_VAL(x) ((x) << 0)
+#define R300_PVS_FC_LOOP_STEP_VAL(x) ((x) << 8)
+
/* gap */
/* Addresses are relative to the vertex program instruction area of the
@@ -489,6 +499,9 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
#define R300_VAP_PVS_CODE_CNTL_1 0x22D8
# define R300_PVS_LAST_VTX_SRC_INST_SHIFT 0
#define R300_VAP_PVS_FLOW_CNTL_OPC 0x22DC
+#define R300_VAP_PVS_FC_OPC_JUMP(x) (1 << (2 * (x)))
+#define R300_VAP_PVS_FC_OPC_LOOP(x) (2 << (2 * (x)))
+#define R300_VAP_PVS_FC_OPC_JSR(x) (3 << (2 * (x)))
/* The entire range from 0x2300 to 0x2AC inclusive seems to be used for
* immediate vertices
@@ -505,6 +518,14 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
/* write 0 to indicate end of packet? */
#define R300_VAP_VTX_END_OF_PKT 0x24AC
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0 0x2500
+#define R500_PVS_FC_ACT_ADRS(x) ((x) << 0)
+#define R500_PVS_FC_LOOP_CNT_JMP_INST(x) ((x) << 16)
+
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_UW_0 0x2504
+#define R500_PVS_FC_LAST_INST(x) ((x) << 0)
+#define R500_PVS_FC_RTN_INST(x) ((x) << 16)
+
/* gap */
/* These are values from r300_reg/r300_reg.h - they are known to be correct
--
1.7.1
--EVF5PPMfhYS0aIcm--
More information about the mesa-dev
mailing list