[Mesa-dev] [PATCH] r600g: rework check_and_set_bank_swizzle

Vadim Girlin vadimgirlin at gmail.com
Sun Dec 30 17:34:58 PST 2012


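Optimize check_and_set_bank_swizzle for better performance: instead of
re-checking every possible combination of bank swizzles from scratch, search
for a swizzle slot by slot and step back to the previous slot on a conflict.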

Signed-off-by: Vadim Girlin <vadimgirlin at gmail.com>
---

I benchmarked this patch together with the patch that introduces the ISA
tables, using the texCombine test (IIRC it builds hundreds of shaders) and
making the driver build each shader 5000 times. The total time of the test
was reduced from 45 to 25 seconds. According to oprofile, the time spent in
check_and_set_bank_swizzle is about 8 times lower, and the time spent in
r600_shader_from_tgsi is reduced by half.

You can also find the patch in this branch:

http://cgit.freedesktop.org/~vadimg/mesa/log/?h=r600-disasm

 src/gallium/drivers/r600/r600_asm.c | 398 ++++++++++++++++++------------------
 1 file changed, 201 insertions(+), 197 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 5981916..c88f395 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -31,9 +31,6 @@
 #include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 
-#define NUM_OF_CYCLES 3
-#define NUM_OF_COMPONENTS 4
-
 static inline unsigned int r600_bytecode_get_num_operands(
 		struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
 {
@@ -259,12 +256,6 @@ static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *
 	return 0;
 }
 
-struct alu_bank_swizzle {
-	int	hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS];
-	int	hw_cfile_addr[4];
-	int	hw_cfile_elem[4];
-};
-
 static const unsigned cycle_for_bank_swizzle_vec[][3] = {
 	[SQ_ALU_VEC_012] = { 0, 1, 2 },
 	[SQ_ALU_VEC_021] = { 0, 2, 1 },
@@ -281,229 +272,242 @@ static const unsigned cycle_for_bank_swizzle_scl[][3] = {
 	[SQ_ALU_SCL_221] = { 2, 2, 1 }
 };
 
-static void init_bank_swizzle(struct alu_bank_swizzle *bs)
-{
-	int i, cycle, component;
-	/* set up gpr use */
-	for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++)
-		for (component = 0; component < NUM_OF_COMPONENTS; component++)
-			 bs->hw_gpr[cycle][component] = -1;
-	for (i = 0; i < 4; i++)
-		bs->hw_cfile_addr[i] = -1;
-	for (i = 0; i < 4; i++)
-		bs->hw_cfile_elem[i] = -1;
-}
-
-static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle)
+static inline int is_gpr(unsigned sel)
 {
-	if (bs->hw_gpr[cycle][chan] == -1)
-		bs->hw_gpr[cycle][chan] = sel;
-	else if (bs->hw_gpr[cycle][chan] != (int)sel) {
-		/* Another scalar operation has already used the GPR read port for the channel. */
-		return -1;
-	}
-	return 0;
+	return (sel <= 127);
 }
 
-static int reserve_cfile(struct r600_bytecode *bc, struct alu_bank_swizzle *bs, unsigned sel, unsigned chan)
+/* CB constants start at 512 (until the translation to kcache,
+ * see r600_bytecode_assign_kcache_banks, but we don't use this function
+ * after translation) */
+static inline int is_untranslated_const_sel(unsigned sel)
 {
-	int res, num_res = 4;
-	if (bc->chip_class >= R700) {
-		num_res = 2;
-		chan /= 2;
-	}
-	for (res = 0; res < num_res; ++res) {
-		if (bs->hw_cfile_addr[res] == -1) {
-			bs->hw_cfile_addr[res] = sel;
-			bs->hw_cfile_elem[res] = chan;
-			return 0;
-		} else if (bs->hw_cfile_addr[res] == sel &&
-			bs->hw_cfile_elem[res] == chan)
-			return 0; /* Read for this scalar element already reserved, nothing to do here. */
-	}
-	/* All cfile read ports are used, cannot reference vector element. */
-	return -1;
+	return sel >= 512;
 }
 
-static int is_gpr(unsigned sel)
+/* we don't check for translated kcache sels because we don't use this
+ * function after translation */
+static inline int is_any_const(int sel)
 {
-	return (sel >= 0 && sel <= 127);
-}
-
-/* CB constants start at 512, and get translated to a kcache index when ALU
- * clauses are constructed. Note that we handle kcache constants the same way
- * as (the now gone) cfile constants, is that really required? */
-static int is_cfile(unsigned sel)
-{
-	return (sel > 255 && sel < 512) ||
-		(sel > 511 && sel < 4607) || /* Kcache before translation. */
-		(sel > 127 && sel < 192); /* Kcache after translation. */
-}
-
-static int is_const(int sel)
-{
-	return is_cfile(sel) ||
+	return is_untranslated_const_sel(sel) ||
 		(sel >= V_SQ_ALU_SRC_0 &&
 		sel <= V_SQ_ALU_SRC_LITERAL);
 }
 
-static int check_vector(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
-			struct alu_bank_swizzle *bs, int bank_swizzle)
+static inline int reserve_const_readport(
+		int sel, int chan, int const_readports[4], int is_r600)
 {
-	int r, src, num_src, sel, elem, cycle;
-
-	num_src = r600_bytecode_get_num_operands(bc, alu);
-	for (src = 0; src < num_src; src++) {
-		sel = alu->src[src].sel;
-		elem = alu->src[src].chan;
-		if (is_gpr(sel)) {
-			cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src];
-			if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan)
-				/* Nothing to do; special-case optimization,
-				 * second source uses first source’s reservation. */
-				continue;
-			else {
-				r = reserve_gpr(bs, sel, elem, cycle);
-				if (r)
-					return r;
-			}
-		} else if (is_cfile(sel)) {
-			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
-			if (r)
-				return r;
+	int port_count, i;
+
+	if (is_r600) {
+		/* we have 4 read ports on R6xx, each can read any const element */
+		port_count = 4;
+		sel = (sel << 2) + chan + 1;
+	} else {
+		/* we have 2 read ports on R700+, each can read pair of elements
+		 * (xy or zw), e.g. we can read C0.xy and C5.xy */
+		port_count = 2;
+		sel = (sel << 1) + (chan >> 1) + 1;
+	}
+
+	for(i = 0; i < port_count; ++i) {
+		if (const_readports[i]) { /* port already allocated */
+			if (const_readports[i] == sel) /* reads same data - we can reuse */
+				return 0;
+		} else {
+			/* port is free - allocate it */
+			const_readports[i] = sel;
+			return 0;
 		}
-		/* No restrictions on PV, PS, literal or special constants. */
 	}
-	return 0;
+
+	return -1;
 }
 
-static int check_scalar(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
-			struct alu_bank_swizzle *bs, int bank_swizzle)
+/* returns cycle index */
+static inline int cycle_for_slot(int slot, int bank_swizzle, int src)
 {
-	int r, src, num_src, const_count, sel, elem, cycle;
-
-	num_src = r600_bytecode_get_num_operands(bc, alu);
-	for (const_count = 0, src = 0; src < num_src; ++src) {
-		sel = alu->src[src].sel;
-		elem = alu->src[src].chan;
-		if (is_const(sel)) { /* Any constant, including literal and inline constants. */
-			if (const_count >= 2)
-				/* More than two references to a constant in
-				 * transcendental operation. */
-				return -1;
-			else
-				const_count++;
-		}
-		if (is_cfile(sel)) {
-			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
-			if (r)
-				return r;
-		}
-	}
-	for (src = 0; src < num_src; ++src) {
-		sel = alu->src[src].sel;
-		elem = alu->src[src].chan;
-		if (is_gpr(sel)) {
-			cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
-			if (cycle < const_count)
-				/* Cycle for GPR load conflicts with
-				 * constant load in transcendental operation. */
-				return -1;
-			r = reserve_gpr(bs, sel, elem, cycle);
-			if (r)
-				return r;
-		}
-		/* PV PS restrictions */
-		if (const_count && (sel == 254 || sel == 255)) {
-			cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
-			if (cycle < const_count)
-				return -1;
-		}
-	}
-	return 0;
+	if (slot == 4)
+		return cycle_for_bank_swizzle_scl[bank_swizzle][src];
+	else
+		return cycle_for_bank_swizzle_vec[bank_swizzle][src];
 }
 
 static int check_and_set_bank_swizzle(struct r600_bytecode *bc,
 				      struct r600_bytecode_alu *slots[5])
 {
-	struct alu_bank_swizzle bs;
-	int bank_swizzle[5];
-	int i, r = 0, forced = 1;
-	boolean scalar_only = bc->chip_class == CAYMAN ? false : true;
-	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
-
-	for (i = 0; i < max_slots; i++) {
-		if (slots[i]) {
-			if (slots[i]->bank_swizzle_force) {
-				slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
-			} else {
-				forced = 0;
-			}
-		}
-
-		if (i < 4 && slots[i])
-			scalar_only = false;
-	}
-	if (forced)
+	struct r600_bytecode_alu *slot;
+	/* current bank swizzle values for slots */
+	int bswz[5];
+	int scnt, i, src_idx, src_count[5];
+	/* constant read port reservations */
+	int const_readports[4] = {};
+	int slot_count = 0, slot_map[5];
+	/* there is only one case when we have forced swizzles - four INTERP_XY/ZW
+	 * instructions in vector slots, and maybe something in the trans slot */
+	int vec_forced = (slots[0] && slots[0]->bank_swizzle_force);
+	int have_trans = (slots[4] != NULL);
+	/* number of constants, literals, immediates referenced by the trans slot */
+	int trans_const_cnt = 0;
+	/* gpr_sel[slot][src_idx] contains the encoded gpr sel&chan if we need
+	 * to allocate read port for this src, otherwise 0 */
+	int gpr_sel[5][3];
+	/* gpr_res[cycle][chan] contains mapping of sel&chan to readports
+	 * - main reservation table for gpr read ports */
+	int gpr_res[3][4] = {};
+	/* res_track[slot][src] contains encoded cycle&chan of readport, reserved
+	 * for corresponding src, used to rollback the reservations */
+	int res_track[5][3];
+
+	/* if we have only 4 forced vector slots, no need to check anything*/
+	if (vec_forced && !have_trans) {
+		for (i = 0; i < 4; ++i)
+			slots[i]->bank_swizzle = SQ_ALU_VEC_210;
 		return 0;
+	}
 
-	/* Just check every possible combination of bank swizzle.
-	 * Not very efficent, but works on the first try in most of the cases. */
-	for (i = 0; i < 4; i++)
-		if (!slots[i] || !slots[i]->bank_swizzle_force)
-			bank_swizzle[i] = SQ_ALU_VEC_012;
-		else
-			bank_swizzle[i] = slots[i]->bank_swizzle;
-
-	bank_swizzle[4] = SQ_ALU_SCL_210;
-	while(bank_swizzle[4] <= SQ_ALU_SCL_221) {
+	/* we'll check constants first and fill gpr_sel */
+	/* use reversed order to handle trans slot first, it's more limited */
+	for (i = 4; i >= 0; --i) {
+		if (!(slot = slots[i]))
+			continue;
 
-		if (max_slots == 4) {
-			for (i = 0; i < max_slots; i++) {
-				if (bank_swizzle[i] == SQ_ALU_VEC_210)
-				  return -1;
-			}
-		}
-		init_bank_swizzle(&bs);
-		if (scalar_only == false) {
-			for (i = 0; i < 4; i++) {
-				if (slots[i]) {
-					r = check_vector(bc, slots[i], &bs, bank_swizzle[i]);
-					if (r)
-						break;
+		slot_map[slot_count++] = i;
+		bswz[i] = slot->bank_swizzle_force ? slot->bank_swizzle_force : 0;
+		src_count[i] = scnt = r600_bytecode_get_num_operands(bc, slot);
+
+		for (src_idx = 0; src_idx < scnt; ++src_idx) {
+			struct r600_bytecode_alu_src *src = &slot->src[src_idx];
+			int ssel = src->sel;
+			gpr_sel[i][src_idx] = 0;
+
+			if (is_any_const(ssel)) {
+				/* trans slot can't use more than 2 constants of any type */
+				if (i == 4 && ++trans_const_cnt > 2)
+					return -1;
+
+				/* if kcache - try to reserve readport */
+				if (is_untranslated_const_sel(ssel) &&
+					reserve_const_readport((src->kc_bank << 12) + ssel,
+						src->chan, const_readports, bc->chip_class == R600))
+					return -1;
+
+			} else if (is_gpr(ssel)) {
+				ssel = (ssel << 2) + src->chan + 1;
+				/* vector slots have optimization for src[0] == src[1] case,
+				 * so we don't alloc gpr readport in this case */
+				if (i == 4 || src_idx != 1 || gpr_sel[i][0] != ssel) {
+					gpr_sel[i][src_idx] = ssel;
 				}
 			}
-		} else
-			r = 0;
-
-		if (!r && slots[4] && max_slots == 5) {
-			r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]);
-		}
-		if (!r) {
-			for (i = 0; i < max_slots; i++) {
-				if (slots[i])
-					slots[i]->bank_swizzle = bank_swizzle[i];
-			}
-			return 0;
 		}
+	}
 
-		if (scalar_only) {
-			bank_swizzle[4]++;
-		} else {
-			for (i = 0; i < max_slots; i++) {
-				if (!slots[i] || !slots[i]->bank_swizzle_force) {
-					bank_swizzle[i]++;
-					if (bank_swizzle[i] <= SQ_ALU_VEC_210)
+	if (vec_forced) {
+		/* here we have four vector slots with forced swizzle and the trans
+		 * slot, so we only need to pre-reserve two gprs for INTERP's in forced
+		 * slots and try to find correct bank swizzle for the trans slot only.
+		 * INTERP uses gprs in src[0], two gprs are interleaved,
+		 * e.g. slot 0 - R0.y, 1 - R0.x, 2 - R0.y, 3 - R0.x */
+		int sel1 = gpr_sel[0][0], sel2 = gpr_sel[1][0];
+		int chan1 = (sel1-1) & 3, chan2 = (sel2-1) & 3;
+		/* gprs should be assigned to cycle 2 (INTERP uses VEC_210 swizzle) */
+		gpr_res[2][chan1] = sel1;
+		gpr_res[2][chan2] = sel2;
+		/* limit iteration to trans slot only */
+		slot_count = 1;
+	}
+
+	/* now handle gpr read ports */
+	/* loop over the slots, we are trying to find swizzles for each.
+	 * (we can step back to previous slot if we failed to find swizzle for the
+	 * current slot */
+	for (i = 0; i < slot_count; ++i) {
+		int nslot = slot_map[i];
+		int scalar = (nslot == 4);
+		int swz_cnt = (scalar ? 4 : 6);
+		int cbs = bswz[nslot];
+		int conflict = 1;
+
+		slot = slots[nslot];
+		scnt = src_count[nslot];
+
+		/* loop over the possible swizzle values for current slot */
+		while (cbs < swz_cnt) {
+			conflict = 0;
+			/* loop over the src */
+			for (src_idx = 0; src_idx < scnt; ++src_idx) {
+				int cycle = cycle_for_slot(nslot, cbs, src_idx);
+				int sel = gpr_sel[nslot][src_idx];
+				res_track[nslot][src_idx] = 0;
+
+				/* trans slot always reads constants on the first cycles,
+				 * so it can't read gpr/PV/PS on the cycle 0 if we have 1 const,
+				 * and on the cycles 0-1 if we have 2 constants */
+				if (scalar && cycle < trans_const_cnt &&
+					(sel || slot->src[src_idx].sel == V_SQ_ALU_SRC_PS ||
+							slot->src[src_idx].sel == V_SQ_ALU_SRC_PV)) {
+					conflict = 1;
+					break;
+				}
+				/* if src needs gpr read port */
+				if (sel) {
+					/* reserve gpr read port */
+					int chan = (sel-1) & 3;
+					int csel = gpr_res[cycle][chan];
+					if (csel == 0) {
+						/* if readport is free, reserve it */
+						gpr_res[cycle][chan] = sel;
+						/* and remember what we reserved */
+						res_track[nslot][src_idx] = (cycle<<2) + chan + 1;
+					} else if (csel != sel) {
+						/* port is already reserved for another element */
+						conflict = 1;
 						break;
-					else
-						bank_swizzle[i] = SQ_ALU_VEC_012;
+					}
 				}
 			}
+			if (conflict) {
+				/* revert reservations for this slot */
+				for (; src_idx > 0; --src_idx) {
+					int cc = res_track[nslot][src_idx-1];
+					if (cc) {
+						cc -= 1;
+						gpr_res[cc>>2][cc&3] = 0;
+					}
+				}
+				/* try next swizzle */
+				++cbs;
+			} else
+				/* no conflicts so far - go to the next slot */
+				break;
+		}
+		if (cbs == swz_cnt) {
+			/* swizzle not found, if it's a first slot - fail */
+			if (i == 0)
+				return -1;
+			/* reset swizzle for current slot */
+			bswz[nslot] = 0;
+			/* back to previous slot, (-2 because the loop will increment it */
+			i-=2;
+			/* use next swizzle for previous slot */
+			bswz[slot_map[i+1]] = bswz[slot_map[i+1]] + 1;
+		} else {
+			/* good swizzle found - remember it */
+			bswz[nslot] = cbs;
 		}
 	}
 
-	/* Couldn't find a working swizzle. */
-	return -1;
+	if (i < 0)
+		return -1;
+
+	/* swizzles found, assign them to instructions */
+	for(i = 0; i < 5; ++i) {
+		if (slots[i]) {
+			slots[i]->bank_swizzle = bswz[i];
+		}
+	}
+	return 0;
 }
 
 static int replace_gpr_with_pv_ps(struct r600_bytecode *bc,
-- 
1.8.0.2



More information about the mesa-dev mailing list