[Mesa-dev] [PATCH] r600g: fixup AR handling (v4)
Dave Airlie
airlied at gmail.com
Thu Jan 19 03:13:47 PST 2012
From: Dave Airlie <airlied at redhat.com>
So it appears R600s (except rv670) do AR handling different using a different
opcode. This patch fixes up r600g to work properly on r600.
This fixes ~100 piglit tests here (in GLSL1.30 mode) on rv610.
v3: add index_mode as per the docs.
This still fails any dst relative tests for some reason I can't quite see yet,
but it passes a lot more tests than without.
v4: add a nop after dst.rel this could be improved using a second pass,
where we only insert nops if two instructions are sure to collide.
Signed-off-by: Dave Airlie <airlied at redhat.com>
---
src/gallium/drivers/r600/r600_asm.c | 82 ++++++++++++++++++++++++++++++--
src/gallium/drivers/r600/r600_asm.h | 9 +++-
src/gallium/drivers/r600/r600_shader.c | 2 +-
src/gallium/drivers/r600/r600_sq.h | 7 +++
4 files changed, 93 insertions(+), 7 deletions(-)
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 8234744..16c1143 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -94,6 +94,7 @@ static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *
case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA:
case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR:
+ case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT:
case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
@@ -249,8 +250,18 @@ static struct r600_bytecode_tex *r600_bytecode_tex(void)
return tex;
}
-void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class)
+void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, enum radeon_family family)
{
+ if ((chip_class == R600) && (family != CHIP_RV670))
+ bc->ar_handling = AR_HANDLE_RV6XX;
+ else
+ bc->ar_handling = AR_HANDLE_NORMAL;
+
+ if ((chip_class == R600) && (family != CHIP_RV670 && family != CHIP_RS780 &&
+ family != CHIP_RS880))
+ bc->r6xx_nop_after_rel_dst = 1;
+ else
+ bc->r6xx_nop_after_rel_dst = 0;
LIST_INITHEAD(&bc->cf);
bc->chip_class = chip_class;
}
@@ -441,7 +452,8 @@ static int is_alu_mova_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *
return !alu->is_op3 && (
alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
- alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
+ alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT ||
+ alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT);
case EVERGREEN:
case CAYMAN:
default:
@@ -457,7 +469,8 @@ static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_a
case R600:
case R700:
return is_alu_reduction_inst(bc, alu) ||
- is_alu_mova_inst(bc, alu);
+ (is_alu_mova_inst(bc, alu) &&
+ (alu->inst != V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT));
case EVERGREEN:
case CAYMAN:
default:
@@ -478,6 +491,7 @@ static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode
case R700:
if (!alu->is_op3)
return alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
+ alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT ||
alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT ||
alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT ||
@@ -1048,6 +1062,10 @@ static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu
alu = slots[i];
num_once_inst += is_alu_once_inst(bc, alu);
+ /* don't reschedule NOPs */
+ if (alu->inst == BC_INST(bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP))
+ return 0;
+
/* Let's check dst gpr. */
if (alu->dst.rel) {
if (have_mova)
@@ -1236,12 +1254,60 @@ static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r60
return 0;
}
+static int insert_nop_r6xx(struct r600_bytecode *bc)
+{
+ struct r600_bytecode_alu alu;
+ int r, i;
+
+ for (i = 0; i < 4; i++) {
+ memset(&alu, 0, sizeof(alu));
+ alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP;
+ alu.src[0].chan = i;
+ alu.dst.chan = i;
+ alu.last = (i == 3);
+ r = r600_bytecode_add_alu(bc, &alu);
+ if (r)
+ return r;
+ }
+ return 0;
+}
+
+/* load AR register from gpr (bc->ar_reg) with MOVA_INT */
+static int load_ar_r6xx(struct r600_bytecode *bc)
+{
+ struct r600_bytecode_alu alu;
+ int r;
+
+ if (bc->ar_loaded)
+ return 0;
+
+ /* hack to avoid making MOVA the last instruction in the clause */
+ if ((bc->cf_last->ndw>>1) >= 110)
+ bc->force_add_cf = 1;
+
+ memset(&alu, 0, sizeof(alu));
+ alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT;
+ alu.src[0].sel = bc->ar_reg;
+ alu.last = 1;
+ alu.index_mode = INDEX_MODE_LOOP;
+ r = r600_bytecode_add_alu(bc, &alu);
+ if (r)
+ return r;
+
+ /* no requirement to set uses waterfall on MOVA_GPR_INT */
+ bc->ar_loaded = 1;
+ return 0;
+}
+
/* load AR register from gpr (bc->ar_reg) with MOVA_INT */
static int load_ar(struct r600_bytecode *bc)
{
struct r600_bytecode_alu alu;
int r;
+ if (bc->ar_handling)
+ return load_ar_r6xx(bc);
+
if (bc->ar_loaded)
return 0;
@@ -1376,6 +1442,10 @@ int r600_bytecode_add_alu_type(struct r600_bytecode *bc, const struct r600_bytec
bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
bc->cf_last->curr_bs_head = NULL;
}
+
+ if (nalu->dst.rel && bc->r6xx_nop_after_rel_dst)
+ insert_nop_r6xx(bc);
+
return 0;
}
@@ -1599,6 +1669,7 @@ static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecod
S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
+ S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) |
S_SQ_ALU_WORD0_LAST(alu->last);
if (alu->is_op3) {
@@ -2286,7 +2357,8 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel);
fprintf(stderr, "REL:%d ", alu->src[1].rel);
fprintf(stderr, "CHAN:%d ", alu->src[1].chan);
- fprintf(stderr, "NEG:%d) ", alu->src[1].neg);
+ fprintf(stderr, "NEG:%d ", alu->src[1].neg);
+ fprintf(stderr, "IM:%d) ", alu->index_mode);
fprintf(stderr, "LAST:%d)\n", alu->last);
id++;
fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], alu->last ? '*' : ' ');
@@ -2565,7 +2637,7 @@ int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, stru
}
memset(&bc, 0, sizeof(bc));
- r600_bytecode_init(&bc, rctx->chip_class);
+ r600_bytecode_init(&bc, rctx->chip_class, rctx->family);
for (i = 0; i < ve->count; i++) {
if (elements[i].instance_divisor > 1) {
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index d0ff75d..00f7e59 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -54,6 +54,7 @@ struct r600_bytecode_alu {
unsigned bank_swizzle;
unsigned bank_swizzle_force;
unsigned omod;
+ unsigned index_mode;
};
struct r600_bytecode_tex {
@@ -176,6 +177,10 @@ struct r600_cf_callstack {
int max;
};
+#define AR_HANDLE_NORMAL 0
+#define AR_HANDLE_RV6XX 1 /* except RV670 */
+
+
struct r600_bytecode {
enum chip_class chip_class;
int type;
@@ -194,13 +199,15 @@ struct r600_bytecode {
struct r600_cf_callstack callstack[SQ_MAX_CALL_DEPTH];
unsigned ar_loaded;
unsigned ar_reg;
+ unsigned ar_handling;
+ unsigned r6xx_nop_after_rel_dst;
};
/* eg_asm.c */
int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf);
/* r600_asm.c */
-void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class);
+void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, enum radeon_family family);
void r600_bytecode_clear(struct r600_bytecode *bc);
int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu);
int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx);
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 59d41cf..5819c2b 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -807,7 +807,7 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
ctx.bc = &shader->bc;
ctx.shader = shader;
- r600_bytecode_init(ctx.bc, rctx->chip_class);
+ r600_bytecode_init(ctx.bc, rctx->chip_class, rctx->family);
ctx.tokens = tokens;
tgsi_scan_shader(tokens, &ctx.info);
tgsi_parse_init(&ctx.parse, tokens);
diff --git a/src/gallium/drivers/r600/r600_sq.h b/src/gallium/drivers/r600/r600_sq.h
index b9c4126..4b2a19a 100644
--- a/src/gallium/drivers/r600/r600_sq.h
+++ b/src/gallium/drivers/r600/r600_sq.h
@@ -471,4 +471,11 @@
#define SQ_ALU_SCL_122 0x00000001
#define SQ_ALU_SCL_212 0x00000002
#define SQ_ALU_SCL_221 0x00000003
+
+#define INDEX_MODE_AR_X 0
+#define INDEX_MODE_AR_Y 1
+#define INDEX_MODE_AR_Z 2
+#define INDEX_MODE_AR_W 3
+#define INDEX_MODE_LOOP 4
+
#endif
--
1.7.7.4
More information about the mesa-dev
mailing list