[Mesa-dev] [PATCH] r600g: fixup AR handling (v4)

Dave Airlie airlied at gmail.com
Thu Jan 19 03:13:47 PST 2012


From: Dave Airlie <airlied at redhat.com>

So it appears R600s (except rv670) do AR handling differently, using a
different opcode. This patch fixes up r600g to work properly on r600.

This fixes ~100 piglit tests here (in GLSL1.30 mode) on rv610.

v3: add index_mode as per the docs.

This still fails any dst relative tests for some reason I can't quite see yet,
but it passes a lot more tests than without.

v4: add a nop after dst.rel; this could be improved using a second pass,
where we only insert nops if two instructions are sure to collide.

Signed-off-by: Dave Airlie <airlied at redhat.com>
---
 src/gallium/drivers/r600/r600_asm.c    |   82 ++++++++++++++++++++++++++++++--
 src/gallium/drivers/r600/r600_asm.h    |    9 +++-
 src/gallium/drivers/r600/r600_shader.c |    2 +-
 src/gallium/drivers/r600/r600_sq.h     |    7 +++
 4 files changed, 93 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 8234744..16c1143 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -94,6 +94,7 @@ static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR:
+		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
@@ -249,8 +250,18 @@ static struct r600_bytecode_tex *r600_bytecode_tex(void)
 	return tex;
 }
 
-void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class)
+void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, enum radeon_family family)
 {
+	if ((chip_class == R600) && (family != CHIP_RV670))
+		bc->ar_handling = AR_HANDLE_RV6XX;
+	else
+		bc->ar_handling = AR_HANDLE_NORMAL;
+
+	if ((chip_class == R600) && (family != CHIP_RV670 && family != CHIP_RS780 &&
+					   family != CHIP_RS880))
+		bc->r6xx_nop_after_rel_dst = 1;
+	else
+		bc->r6xx_nop_after_rel_dst = 0;
 	LIST_INITHEAD(&bc->cf);
 	bc->chip_class = chip_class;
 }
@@ -441,7 +452,8 @@ static int is_alu_mova_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *
 		return !alu->is_op3 && (
 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
 			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
-			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
+			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT ||
+			alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT);
 	case EVERGREEN:
 	case CAYMAN:
 	default:
@@ -457,7 +469,8 @@ static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_a
 	case R600:
 	case R700:
 		return is_alu_reduction_inst(bc, alu) ||
-			is_alu_mova_inst(bc, alu);
+			(is_alu_mova_inst(bc, alu) && 
+			 (alu->inst != V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT));
 	case EVERGREEN:
 	case CAYMAN:
 	default:
@@ -478,6 +491,7 @@ static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode
 	case R700:
 		if (!alu->is_op3)
 			return alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
+				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT ||
 				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
 			        alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT ||
 				alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT ||
@@ -1048,6 +1062,10 @@ static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu
 		alu = slots[i];
 		num_once_inst += is_alu_once_inst(bc, alu);
 
+		/* don't reschedule NOPs */
+		if (alu->inst == BC_INST(bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP))
+			return 0;
+
 		/* Let's check dst gpr. */
 		if (alu->dst.rel) {
 			if (have_mova)
@@ -1236,12 +1254,60 @@ static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r60
 	return 0;
 }
 
+static int insert_nop_r6xx(struct r600_bytecode *bc)
+{
+	struct r600_bytecode_alu alu;
+	int r, i;
+
+	for (i = 0; i < 4; i++) {
+		memset(&alu, 0, sizeof(alu));
+		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP;
+		alu.src[0].chan = i;
+		alu.dst.chan = i;
+		alu.last = (i == 3);
+		r = r600_bytecode_add_alu(bc, &alu);
+		if (r)
+			return r;
+	}
+	return 0;
+}
+
+/* load AR register from gpr (bc->ar_reg) with MOVA_GPR_INT */
+static int load_ar_r6xx(struct r600_bytecode *bc)
+{
+	struct r600_bytecode_alu alu;
+	int r;
+
+	if (bc->ar_loaded)
+		return 0;
+
+	/* hack to avoid making MOVA the last instruction in the clause */
+	if ((bc->cf_last->ndw>>1) >= 110)
+		bc->force_add_cf = 1;
+
+	memset(&alu, 0, sizeof(alu));
+	alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT;
+	alu.src[0].sel = bc->ar_reg;
+	alu.last = 1;
+	alu.index_mode = INDEX_MODE_LOOP;
+	r = r600_bytecode_add_alu(bc, &alu);
+	if (r)
+		return r;
+
+	/* no requirement to set uses waterfall on MOVA_GPR_INT */
+	bc->ar_loaded = 1;
+	return 0;
+}
+
 /* load AR register from gpr (bc->ar_reg) with MOVA_INT */
 static int load_ar(struct r600_bytecode *bc)
 {
 	struct r600_bytecode_alu alu;
 	int r;
 
+	if (bc->ar_handling)
+		return load_ar_r6xx(bc);
+
 	if (bc->ar_loaded)
 		return 0;
 
@@ -1376,6 +1442,10 @@ int r600_bytecode_add_alu_type(struct r600_bytecode *bc, const struct r600_bytec
 		bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
 		bc->cf_last->curr_bs_head = NULL;
 	}
+
+	if (nalu->dst.rel && bc->r6xx_nop_after_rel_dst)
+		insert_nop_r6xx(bc);
+
 	return 0;
 }
 
@@ -1599,6 +1669,7 @@ static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecod
 				S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
 				S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
 				S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
+				S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) |
 				S_SQ_ALU_WORD0_LAST(alu->last);
 
 	if (alu->is_op3) {
@@ -2286,7 +2357,8 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
 			fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel);
 			fprintf(stderr, "REL:%d ", alu->src[1].rel);
 			fprintf(stderr, "CHAN:%d ", alu->src[1].chan);
-			fprintf(stderr, "NEG:%d) ", alu->src[1].neg);
+			fprintf(stderr, "NEG:%d ", alu->src[1].neg);
+			fprintf(stderr, "IM:%d) ", alu->index_mode);
 			fprintf(stderr, "LAST:%d)\n", alu->last);
 			id++;
 			fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], alu->last ? '*' : ' ');
@@ -2565,7 +2637,7 @@ int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, stru
 	}
 
 	memset(&bc, 0, sizeof(bc));
-	r600_bytecode_init(&bc, rctx->chip_class);
+	r600_bytecode_init(&bc, rctx->chip_class, rctx->family);
 
 	for (i = 0; i < ve->count; i++) {
 		if (elements[i].instance_divisor > 1) {
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index d0ff75d..00f7e59 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -54,6 +54,7 @@ struct r600_bytecode_alu {
 	unsigned			bank_swizzle;
 	unsigned			bank_swizzle_force;
 	unsigned			omod;
+	unsigned                        index_mode;
 };
 
 struct r600_bytecode_tex {
@@ -176,6 +177,10 @@ struct r600_cf_callstack {
 	int				max;
 };
 
+#define AR_HANDLE_NORMAL 0
+#define AR_HANDLE_RV6XX 1 /* except RV670 */
+
+
 struct r600_bytecode {
 	enum chip_class			chip_class;
 	int				type;
@@ -194,13 +199,15 @@ struct r600_bytecode {
 	struct r600_cf_callstack	callstack[SQ_MAX_CALL_DEPTH];
 	unsigned	ar_loaded;
 	unsigned	ar_reg;
+	unsigned        ar_handling;
+	unsigned        r6xx_nop_after_rel_dst;
 };
 
 /* eg_asm.c */
 int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf);
 
 /* r600_asm.c */
-void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class);
+void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, enum radeon_family family);
 void r600_bytecode_clear(struct r600_bytecode *bc);
 int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu);
 int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx);
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 59d41cf..5819c2b 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -807,7 +807,7 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 
 	ctx.bc = &shader->bc;
 	ctx.shader = shader;
-	r600_bytecode_init(ctx.bc, rctx->chip_class);
+	r600_bytecode_init(ctx.bc, rctx->chip_class, rctx->family);
 	ctx.tokens = tokens;
 	tgsi_scan_shader(tokens, &ctx.info);
 	tgsi_parse_init(&ctx.parse, tokens);
diff --git a/src/gallium/drivers/r600/r600_sq.h b/src/gallium/drivers/r600/r600_sq.h
index b9c4126..4b2a19a 100644
--- a/src/gallium/drivers/r600/r600_sq.h
+++ b/src/gallium/drivers/r600/r600_sq.h
@@ -471,4 +471,11 @@
 #define SQ_ALU_SCL_122                           0x00000001
 #define SQ_ALU_SCL_212                           0x00000002
 #define SQ_ALU_SCL_221                           0x00000003
+
+#define   INDEX_MODE_AR_X 0
+#define   INDEX_MODE_AR_Y 1
+#define   INDEX_MODE_AR_Z 2
+#define   INDEX_MODE_AR_W 3
+#define   INDEX_MODE_LOOP 4
+
 #endif
-- 
1.7.7.4



More information about the mesa-dev mailing list