[Mesa-dev] [PATCH] r600g/sb: Support gs5 sampler indexing

Mon Sep 21 07:21:37 PDT 2015

Signed-off-by: Glenn Kennard <glenn.kennard at gmail.com>
---
Just UBO support left before gs5 can be enabled.
Could improve how the two index registers are set/used to reduce
the number of clauses, but as is, it's about as good as what the blob
emits.

 src/gallium/drivers/r600/r600_shader.c       |  12 ++-
 src/gallium/drivers/r600/r600_shader.h       |   4 +-
 src/gallium/drivers/r600/sb/sb_bc.h          |  10 ++-
 src/gallium/drivers/r600/sb/sb_bc_dump.cpp   |  17 +++-
 src/gallium/drivers/r600/sb/sb_bc_parser.cpp |  50 +++++++++++-
 src/gallium/drivers/r600/sb/sb_gcm.cpp       |  11 ++-
 src/gallium/drivers/r600/sb/sb_sched.cpp     | 118 +++++++++++++++++++++++++--
 src/gallium/drivers/r600/sb/sb_sched.h       |   5 +-
 8 files changed, 201 insertions(+), 26 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 1d90582..24c3d43 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -166,8 +166,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
     if (rctx->b.chip_class <= R700) {
 	    use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY);
     }
-	/* disable SB for shaders using CF_INDEX_0/1 (sampler/ubo array indexing) as it doesn't handle those currently */
-	use_sb &= !shader->shader.uses_index_registers;
+	/* disable SB for shaders using ubo array indexing as it doesn't handle those currently */
+	use_sb &= !shader->shader.uses_ubo_indexing;
 	/* disable SB for shaders using doubles */
 	use_sb &= !shader->shader.uses_doubles;
 
@@ -1251,7 +1251,7 @@ static int tgsi_split_constant(struct r600_shader_ctx *ctx)
 		}
 
 		if (ctx->src[i].kc_rel)
-			ctx->shader->uses_index_registers = true;
+			ctx->shader->uses_ubo_indexing = true;
 
 		if (ctx->src[i].rel) {
 			int chan = inst->Src[i].Indirect.Swizzle;
@@ -1912,7 +1912,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 
 	shader->uses_doubles = ctx.info.uses_doubles;
 
-	indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT);
+	indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
 	tgsi_parse_init(&ctx.parse, tokens);
 	ctx.type = ctx.info.processor;
 	shader->processor_type = ctx.type;
@@ -1936,7 +1936,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	ctx.gs_next_vertex = 0;
 	ctx.gs_stream_output_info = &so;
 
-	shader->uses_index_registers = false;
+	shader->uses_ubo_indexing = false;
 	ctx.face_gpr = -1;
 	ctx.fixed_pt_position_gpr = -1;
 	ctx.fragcoord_input = -1;
@@ -5703,8 +5703,6 @@ static int tgsi_tex(struct r600_shader_ctx *ctx)
 		sampler_src_reg = 3;
 
 	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
-	if (sampler_index_mode)
-		ctx->shader->uses_index_registers = true;
 
 	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
 
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index 48de9cd..8ba32ae 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -75,8 +75,8 @@ struct r600_shader {
 	boolean			has_txq_cube_array_z_comp;
 	boolean			uses_tex_buffers;
 	boolean                 gs_prim_id_input;
-	/* Temporarily workaround SB not handling CF_INDEX_[01] index registers */
-	boolean			uses_index_registers;
+	/* Temporarily workaround SB not handling ubo indexing */
+	boolean			uses_ubo_indexing;
 
 	/* Size in bytes of a data item in the ring(s) (single vertex data).
 	   Stages with only one ring items 123 will be set to 0. */
diff --git a/src/gallium/drivers/r600/sb/sb_bc.h b/src/gallium/drivers/r600/sb/sb_bc.h
index ab988f8..126750d 100644
--- a/src/gallium/drivers/r600/sb/sb_bc.h
+++ b/src/gallium/drivers/r600/sb/sb_bc.h
@@ -48,6 +48,7 @@ class fetch_node;
 class alu_group_node;
 class region_node;
 class shader;
+class value;
 
 class sb_ostream {
 public:
@@ -818,13 +819,16 @@ class bc_parser {
 
 	bool gpr_reladdr;
 
+	// Note: currently relies on input emitting SET_CF in same basic block as uses
+	value *cf_index_value[2];
+	alu_node *mova;
 public:
 
 	bc_parser(sb_context &sctx, r600_bytecode *bc, r600_shader* pshader) :
 		ctx(sctx), dec(), bc(bc), pshader(pshader),
 		dw(), bc_ndw(), max_cf(),
 		sh(), error(), slots(), cgroup(),
-		cf_map(), loop_stack(), gpr_reladdr() { }
+		cf_map(), loop_stack(), gpr_reladdr(), cf_index_value(), mova() { }
 
 	int decode();
 	int prepare();
@@ -852,6 +856,10 @@ private:
 	int prepare_loop(cf_node *c);
 	int prepare_if(cf_node *c);
 
+	void save_set_cf_index(value *val, unsigned idx);
+	value *get_cf_index_value(unsigned idx);
+	void save_mova(alu_node *mova);
+	alu_node *get_mova();
 };
 
 
diff --git a/src/gallium/drivers/r600/sb/sb_bc_dump.cpp b/src/gallium/drivers/r600/sb/sb_bc_dump.cpp
index 0fc73c4..3c70ea7 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_dump.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_dump.cpp
@@ -27,6 +27,7 @@
 #include "sb_bc.h"
 #include "sb_shader.h"
 #include "sb_pass.h"
+#include "eg_sq.h" // V_SQ_CF_INDEX_0/1
 
 namespace r600_sb {
 
@@ -354,6 +355,14 @@ void bc_dump::dump(alu_node& n) {
 			s << "  " << vec_bs[n.bc.bank_swizzle];
 	}
 
+	if (ctx.is_cayman()) {
+		if (n.bc.op == ALU_OP1_MOVA_INT) {
+			static const char *mova_str[] = { " AR_X", " PC", " CF_IDX0", " CF_IDX1",
+				" Unknown MOVA_INT dest" };
+			s << mova_str[std::min(n.bc.dst_gpr, 4u)];  // CM_V_SQ_MOVA_DST_AR_*
+		}
+	}
+
 	sblog << s.str() << "\n";
 }
 
@@ -450,9 +459,9 @@ void bc_dump::dump(fetch_node& n) {
 		if (n.bc.fetch_whole_quad)
 			s << " FWQ";
 		if (ctx.is_egcm() && n.bc.resource_index_mode)
-			s << " RIM:SQ_CF_INDEX_" << n.bc.resource_index_mode;
+			s << " RIM:SQ_CF_INDEX_" << (n.bc.resource_index_mode - V_SQ_CF_INDEX_0);
 		if (ctx.is_egcm() && n.bc.sampler_index_mode)
-			s << " SID:SQ_CF_INDEX_" << n.bc.sampler_index_mode;
+			s << " SID:SQ_CF_INDEX_" << (n.bc.sampler_index_mode - V_SQ_CF_INDEX_0);
 
 		s << " UCF:" << n.bc.use_const_fields
 				<< " FMT(DTA:" << n.bc.data_format
@@ -470,9 +479,9 @@ void bc_dump::dump(fetch_node& n) {
 			if (n.bc.offset[k])
 				s << " O" << chans[k] << ":" << n.bc.offset[k];
 		if (ctx.is_egcm() && n.bc.resource_index_mode)
-			s << " RIM:SQ_CF_INDEX_" << n.bc.resource_index_mode;
+			s << " RIM:SQ_CF_INDEX_" << (n.bc.resource_index_mode - V_SQ_CF_INDEX_0);
 		if (ctx.is_egcm() && n.bc.sampler_index_mode)
-			s << " SID:SQ_CF_INDEX_" << n.bc.sampler_index_mode;
+			s << " SID:SQ_CF_INDEX_" << (n.bc.sampler_index_mode - V_SQ_CF_INDEX_0);
 	}
 
 	sblog << s.str() << "\n";
diff --git a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
index 19bd078..eb43670 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
@@ -34,6 +34,7 @@
 
 #include "r600_pipe.h"
 #include "r600_shader.h"
+#include "eg_sq.h" // CM_V_SQ_MOVA_DST_CF_IDX0/1
 
 #include <stack>
 
@@ -121,7 +122,7 @@ int bc_parser::parse_decls() {
 		return 0;
 	}
 
-	if (pshader->indirect_files & ~(1 << TGSI_FILE_CONSTANT)) {
+	if (pshader->indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER))) {
 
 		assert(pshader->num_arrays);
 
@@ -328,6 +329,28 @@ int bc_parser::prepare_alu_clause(cf_node* cf) {
 	return 0;
 }
 
+void bc_parser::save_set_cf_index(value *val, unsigned idx)
+{	// Record val as the value held by CF index register idx (0 = CF_IDX0, 1 = CF_IDX1).
+	assert(idx <= 1); // cf_index_value has exactly two entries
+	assert(val);
+	cf_index_value[idx] = val;
+}
+value *bc_parser::get_cf_index_value(unsigned idx)
+{	// Return the value last saved for CF index register idx; null if none was saved yet.
+	assert(idx <= 1);
+	return cf_index_value[idx];
+}
+void bc_parser::save_mova(alu_node *mova)
+{	// Remember the most recent MOVA node so a later SET_CF_IDX0/1 can reuse its source.
+	assert(mova);
+	this->mova = mova; // parameter shadows the member, hence this->
+}
+alu_node *bc_parser::get_mova()
+{	// Return the MOVA node recorded by save_mova(); asserts that one has been recorded.
+	assert(mova);
+	return mova;
+}
+
 int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) {
 
 	alu_node *n;
@@ -375,9 +398,24 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) {
 			n->dst.resize(1);
 		}
 
-		if (flags & AF_MOVA) {
-
-			n->dst[0] = sh->get_special_value(SV_AR_INDEX);
+		if (n->bc.op == ALU_OP0_SET_CF_IDX0 || n->bc.op == ALU_OP0_SET_CF_IDX1) {
+			// Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX
+			// DCE will kill this op
+			save_set_cf_index(get_mova()->src[0], n->bc.op == ALU_OP0_SET_CF_IDX1);
+		} else if (flags & AF_MOVA) {
+
+			if ((n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX0 ||
+				n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1) &&
+				ctx.is_cayman())
+			{
+				// Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX
+				save_set_cf_index(n->src[0], n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1);
+				n->dst.resize(0);
+			}
+			else {
+				n->dst[0] = sh->get_special_value(SV_AR_INDEX);
+				save_mova(n);
+			}
 
 			n->flags |= NF_DONT_HOIST;
 
@@ -608,6 +646,10 @@ int bc_parser::prepare_fetch_clause(cf_node *cf) {
 					                              n->bc.src_sel[s], false);
 			}
 
+			// Scheduler will emit the appropriate instructions to set CF_IDX0/1
+			if (n->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) {
+				n->src.push_back(get_cf_index_value(n->bc.sampler_index_mode == V_SQ_CF_INDEX_1));
+			}
 		}
 	}
 
diff --git a/src/gallium/drivers/r600/sb/sb_gcm.cpp b/src/gallium/drivers/r600/sb/sb_gcm.cpp
index bccb671..236b2ea 100644
--- a/src/gallium/drivers/r600/sb/sb_gcm.cpp
+++ b/src/gallium/drivers/r600/sb/sb_gcm.cpp
@@ -37,6 +37,7 @@
 #include "sb_bc.h"
 #include "sb_shader.h"
 #include "sb_pass.h"
+#include "eg_sq.h" // V_SQ_CF_INDEX_NONE
 
 namespace r600_sb {
 
@@ -406,6 +407,14 @@ void gcm::bu_sched_bb(bb_node* bb) {
 					ncnt = 3;
 				}
 
+				bool sampler_indexing = false;
+				if (n->is_fetch_inst() &&
+					static_cast<fetch_node *>(n)->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE)
+				{
+					sampler_indexing = true; // Give sampler indexed ops get their own clause
+					ncnt = sh.get_ctx().is_cayman() ? 2 : 3; // MOVA + SET_CF_IDX0/1
+				}
+
 				if ((sq == SQ_TEX || sq == SQ_VTX) &&
 						((last_count >= ctx.max_fetch/2 &&
 						check_alu_ready_count(24)) ||
@@ -418,7 +427,7 @@ void gcm::bu_sched_bb(bb_node* bb) {
 				bu_ready[sq].pop_front();
 
 				if (sq != SQ_CF) {
-					if (!clause) {
+					if (!clause || sampler_indexing) {
 						clause = sh.create_clause(sq == SQ_ALU ?
 								NST_ALU_CLAUSE :
 									sq == SQ_TEX ? NST_TEX_CLAUSE :
diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp
index c98b8ff..601445f 100644
--- a/src/gallium/drivers/r600/sb/sb_sched.cpp
+++ b/src/gallium/drivers/r600/sb/sb_sched.cpp
@@ -36,6 +36,7 @@
 #include "sb_shader.h"
 #include "sb_pass.h"
 #include "sb_sched.h"
+#include "eg_sq.h" // V_SQ_CF_INDEX_NONE/0/1
 
 namespace r600_sb {
 
@@ -781,7 +782,14 @@ void post_scheduler::schedule_bb(bb_node* bb) {
 			sblog << "\n";
 		);
 
-		if (n->subtype == NST_ALU_CLAUSE) {
+		// May require emitting ALU ops to load index registers
+		if (n->is_fetch_clause()) {
+			n->remove();
+			process_fetch(static_cast<container_node *>(n));
+			continue;
+		}
+
+		if (n->is_alu_clause()) {
 			n->remove();
 			process_alu(static_cast<container_node*>(n));
 			continue;
@@ -823,6 +831,102 @@ void post_scheduler::init_regmap() {
 	}
 }
 
+static alu_node *create_set_idx(shader &sh, unsigned ar_idx) {
+	// Create the SET_CF_IDX0/SET_CF_IDX1 ALU op matching ar_idx (V_SQ_CF_INDEX_0 or _1).
+	alu_node *a = sh.create_alu();
+	assert(ar_idx == V_SQ_CF_INDEX_0 || ar_idx == V_SQ_CF_INDEX_1);
+	if (ar_idx == V_SQ_CF_INDEX_0)
+		a->bc.set_op(ALU_OP0_SET_CF_IDX0);
+	else
+		a->bc.set_op(ALU_OP0_SET_CF_IDX1);
+	a->bc.slot = SLOT_X;
+	a->dst.resize(1); // Dummy needed for recolor
+
+	PSC_DUMP(
+		sblog << "created IDX load: ";
+		dump::dump_op(a);
+		sblog << "\n";
+	);
+
+	return a;
+}
+
+void post_scheduler::load_index_register(value *v, unsigned ar_idx)
+{	// Builds the ALU group(s) that load CF index register ar_idx from v and emits them as one clause.
+	alu.reset();
+	// NOTE(review): SET_CF_IDX is queued before the MOVA; presumably groups are laid out bottom-up - confirm.
+	if (!sh.get_ctx().is_cayman()) {
+		// Evergreen has to first load address register, then use CF_SET_IDX0/1
+		alu_group_tracker &rt = alu.grp();
+		alu_node *set_idx = create_set_idx(sh, ar_idx);
+		if (!rt.try_reserve(set_idx)) { // failure is only logged; emission continues
+			sblog << "can't emit SET_CF_IDX";
+			dump::dump_op(set_idx);
+			sblog << "\n";
+		}
+		process_group();
+
+		if (!alu.check_clause_limits()) {
+			// Can't happen since clause only contains MOVA/CF_SET_IDX0/1
+		}
+		alu.emit_group();
+	}
+	// MOVA that reads v; channel SEL_Z is used for V_SQ_CF_INDEX_1, SEL_Y otherwise.
+	alu_group_tracker &rt = alu.grp();
+	alu_node *a = alu.create_ar_load(v, ar_idx == V_SQ_CF_INDEX_1 ? SEL_Z : SEL_Y);
+
+	if (!rt.try_reserve(a)) { // failure is only logged; emission continues
+		sblog << "can't emit AR load : ";
+		dump::dump_op(a);
+		sblog << "\n";
+	}
+
+	process_group();
+
+	if (!alu.check_clause_limits()) {
+		// Can't happen since clause only contains MOVA/CF_SET_IDX0/1
+	}
+
+	alu.emit_group();
+	alu.emit_clause(cur_bb);
+}
+
+void post_scheduler::process_fetch(container_node *c) {
+	// Puts fetch clause c back into cur_bb; for a sampler-indexed fetch it also
+	// calls load_index_register() to emit the ALU ops that set the CF index register.
+	if (c->empty())
+		return;
+
+	for (node_iterator N, I = c->begin(), E = c->end(); I != E; I = N) {
+		N = I;
+		++N;
+		node *n = *I;
+		fetch_node *f = static_cast<fetch_node*>(n);
+
+		PSC_DUMP(
+			sblog << "process_tex ";
+			dump::dump_op(n);
+			sblog << "  ";
+		);
+
+		if (f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) {
+			// Currently require prior opt passes to use one TEX per indexed op
+			assert(f->parent->count() == 1);
+
+			value *v = f->src.back(); // Last src is index offset
+			assert(v); // parser appends a null value if no CF index value was recorded
+			cur_bb->push_front(c);
+
+			load_index_register(v, f->bc.sampler_index_mode);
+			f->src.pop_back(); // Don't need index value any more
+
+			return;
+		}
+	}
+
+	cur_bb->push_front(c);
+}
+
 void post_scheduler::process_alu(container_node *c) {
 
 	if (c->empty())
@@ -1180,7 +1284,7 @@ void post_scheduler::emit_load_ar() {
 	alu.discard_current_group();
 
 	alu_group_tracker &rt = alu.grp();
-	alu_node *a = alu.create_ar_load();
+	alu_node *a = alu.create_ar_load(alu.current_ar, SEL_X);
 
 	if (!rt.try_reserve(a)) {
 		sblog << "can't emit AR load : ";
@@ -1936,11 +2040,9 @@ bool alu_kcache_tracker::update_kc() {
 	return true;
 }
 
-alu_node* alu_clause_tracker::create_ar_load() {
+alu_node* alu_clause_tracker::create_ar_load(value *v, chan_select ar_channel) {
 	alu_node *a = sh.create_alu();
 
-	// FIXME use MOVA_GPR on R6xx
-
 	if (sh.get_ctx().uses_mova_gpr) {
 		a->bc.set_op(ALU_OP1_MOVA_GPR_INT);
 		a->bc.slot = SLOT_TRANS;
@@ -1948,9 +2050,13 @@ alu_node* alu_clause_tracker::create_ar_load() {
 		a->bc.set_op(ALU_OP1_MOVA_INT);
 		a->bc.slot = SLOT_X;
 	}
+	a->bc.dst_chan = ar_channel;
+	if (ar_channel != SEL_X && sh.get_ctx().is_cayman()) {
+		a->bc.dst_gpr = ar_channel == SEL_Y ? CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1;
+	}
 
 	a->dst.resize(1);
-	a->src.push_back(current_ar);
+	a->src.push_back(v);
 
 	PSC_DUMP(
 		sblog << "created AR load: ";
diff --git a/src/gallium/drivers/r600/sb/sb_sched.h b/src/gallium/drivers/r600/sb/sb_sched.h
index 87c4586..2ca7146 100644
--- a/src/gallium/drivers/r600/sb/sb_sched.h
+++ b/src/gallium/drivers/r600/sb/sb_sched.h
@@ -235,7 +235,7 @@ public:
 	void new_group();
 	bool is_empty();
 
-	alu_node* create_ar_load();
+	alu_node* create_ar_load(value *v, chan_select ar_channel);
 
 	void discard_current_group();
 
@@ -266,6 +266,9 @@ public:
 	void run_on(container_node *n);
 	void schedule_bb(bb_node *bb);
 
+	void load_index_register(value *v, unsigned idx);
+	void process_fetch(container_node *c);
+
 	void process_alu(container_node *c);
 	void schedule_alu(container_node *c);
 	bool prepare_alu_group();
-- 
1.9.1