[Mesa-dev] [PATCH 1/2] r600g: improve kcache line sets handling v2

Vadim Girlin vadimgirlin at gmail.com
Fri Jan 20 11:24:32 PST 2012


Add support for multiple kcache banks (constant buffers).
Lock the required lines only.
Allow up to 4 kcache line sets in the alu clause by using ALU_EXTENDED on eg+.

Signed-off-by: Vadim Girlin <vadimgirlin at gmail.com>
---

v2: fix a bug in the r600_bytecode_assign_kcache_banks

 src/gallium/drivers/r600/eg_asm.c   |   17 +++
 src/gallium/drivers/r600/eg_sq.h    |   49 +++++++-
 src/gallium/drivers/r600/r600_asm.c |  232 ++++++++++++++++++++++-------------
 src/gallium/drivers/r600/r600_asm.h |    4 +-
 4 files changed, 216 insertions(+), 86 deletions(-)

diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c
index 877e162..e867ea4 100644
--- a/src/gallium/drivers/r600/eg_asm.c
+++ b/src/gallium/drivers/r600/eg_asm.c
@@ -38,6 +38,23 @@ int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
 	case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
 	case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
 	case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
+		/* prepend ALU_EXTENDED if we need more than 2 kcache sets */
+		if (cf->eg_alu_extended) {
+			bc->bytecode[id++] =
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0(V_SQ_CF_INDEX_NONE) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1(V_SQ_CF_INDEX_NONE) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2(V_SQ_CF_INDEX_NONE) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3(V_SQ_CF_INDEX_NONE) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2(cf->kcache[2].bank) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3(cf->kcache[3].bank) |
+				S_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2(cf->kcache[2].mode);
+			bc->bytecode[id++] = EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_EXTENDED |
+				S_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3(cf->kcache[3].mode) |
+				S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2(cf->kcache[2].addr) |
+				S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3(cf->kcache[3].addr) |
+				S_SQ_CF_ALU_WORD1_EXT_BARRIER(1);
+		}
+
 		bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
 			S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
 			S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
diff --git a/src/gallium/drivers/r600/eg_sq.h b/src/gallium/drivers/r600/eg_sq.h
index 854c1b8..eba42d0 100644
--- a/src/gallium/drivers/r600/eg_sq.h
+++ b/src/gallium/drivers/r600/eg_sq.h
@@ -78,6 +78,10 @@
 #define   S_SQ_CF_ALU_WORD0_KCACHE_MODE0(x)                          (((x) & 0x3) << 30)
 #define   G_SQ_CF_ALU_WORD0_KCACHE_MODE0(x)                          (((x) >> 30) & 0x3)
 #define   C_SQ_CF_ALU_WORD0_KCACHE_MODE0                             0x3FFFFFFF
+#define     V_SQ_CF_KCACHE_NOP                                       0x00000000
+#define     V_SQ_CF_KCACHE_LOCK_1                                    0x00000001
+#define     V_SQ_CF_KCACHE_LOCK_2                                    0x00000002
+#define     V_SQ_CF_KCACHE_LOCK_LOOP_INDEX                           0x00000003
 #define P_SQ_CF_ALU_WORD1
 #define   S_SQ_CF_ALU_WORD1_KCACHE_MODE1(x)                          (((x) & 0x3) << 0)
 #define   G_SQ_CF_ALU_WORD1_KCACHE_MODE1(x)                          (((x) >> 0) & 0x3)
@@ -103,7 +107,50 @@
 #define   S_SQ_CF_ALU_WORD1_BARRIER(x)                               (((x) & 0x1) << 31)
 #define   G_SQ_CF_ALU_WORD1_BARRIER(x)                               (((x) >> 31) & 0x1)
 #define   C_SQ_CF_ALU_WORD1_BARRIER                                  0x7FFFFFFF
-/* extended TODO */
+
+#define P_SQ_CF_ALU_WORD0_EXT
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0(x)           (((x) & 0x3) << 4)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0(x)           (((x) >> 4) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE0              0xFFFFFFCF
+#define	    V_SQ_CF_INDEX_NONE                                       0x00
+#define	    V_SQ_CF_INDEX_0                                          0x01
+#define	    V_SQ_CF_INDEX_1                                          0x02
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1(x)           (((x) & 0x3) << 6)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1(x)           (((x) >> 6) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE1              0xFFFFFF3F
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2(x)           (((x) & 0x3) << 8)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2(x)           (((x) >> 8) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE2              0xFFFFFCFF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3(x)           (((x) & 0x3) << 10)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3(x)           (((x) >> 10) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK_INDEX_MODE3              0xFFFFF3FF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2(x)                      (((x) & 0xF) << 22)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2(x)                      (((x) >> 22) & 0xF)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK2                         0xFC3FFFFF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3(x)                      (((x) & 0xF) << 26)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3(x)                      (((x) >> 26) & 0xF)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_BANK3                         0xC3FFFFFF
+#define   S_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2(x)                      (((x) & 0x3) << 30)
+#define   G_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2(x)                      (((x) >> 30) & 0x3)
+#define   C_SQ_CF_ALU_WORD0_EXT_KCACHE_MODE2                         0x3FFFFFFF
+
+#define P_SQ_CF_ALU_WORD1_EXT
+#define   S_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3(x)                      (((x) & 0x3) << 0)
+#define   G_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3(x)                      (((x) >> 0) & 0x3)
+#define   C_SQ_CF_ALU_WORD1_EXT_KCACHE_MODE3                         0xFFFFFFFC
+#define   S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2(x)                      (((x) & 0xFF) << 2)
+#define   G_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2(x)                      (((x) >> 2) & 0xFF)
+#define   C_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR2                         0xFFFFFC03
+#define   S_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3(x)                      (((x) & 0xFF) << 10)
+#define   G_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3(x)                      (((x) >> 10) & 0xFF)
+#define   C_SQ_CF_ALU_WORD1_EXT_KCACHE_ADDR3                         0xFFFC03FF
+#define   S_SQ_CF_ALU_WORD1_EXT_CF_INST(x)                           (((x) & 0xF) << 26)
+#define   G_SQ_CF_ALU_WORD1_EXT_CF_INST(x)                           (((x) >> 26) & 0xF)
+#define   C_SQ_CF_ALU_WORD1_EXT_CF_INST                              0xC3FFFFFF
+#define   S_SQ_CF_ALU_WORD1_EXT_BARRIER(x)                           (((x) & 0x1) << 31)
+#define   G_SQ_CF_ALU_WORD1_EXT_BARRIER(x)                           (((x) >> 31) & 0x1)
+#define   C_SQ_CF_ALU_WORD1_EXT_BARRIER                              0x7FFFFFFF
+
 /* done */
 #define P_SQ_CF_ALLOC_EXPORT_WORD0
 #define   S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(x)                   (((x) & 0x1FFF) << 0)
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 8234744..e63ace1 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -262,8 +262,14 @@ static int r600_bytecode_add_cf(struct r600_bytecode *bc)
 	if (cf == NULL)
 		return -ENOMEM;
 	LIST_ADDTAIL(&cf->list, &bc->cf);
-	if (bc->cf_last)
+	if (bc->cf_last) {
 		cf->id = bc->cf_last->id + 2;
+		if (bc->cf_last->eg_alu_extended) {
+			/* take into account extended alu size */
+			cf->id += 2;
+			bc->ndw += 2;
+		}
+	}
 	bc->cf_last = cf;
 	bc->ncf++;
 	bc->ndw += 2;
@@ -1122,116 +1128,157 @@ static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu
 	return 0;
 }
 
-/* This code handles kcache lines as single blocks of 32 constants. We could
- * probably do slightly better by recognizing that we actually have two
- * consecutive lines of 16 constants, but the resulting code would also be
- * somewhat more complicated. */
-static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, int type)
+/* we'll keep kcache sets sorted by bank & addr */
+static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc,
+		struct r600_bytecode_kcache *kcache,
+		unsigned bank, unsigned line)
 {
-	struct r600_bytecode_kcache *kcache = bc->cf_last->kcache;
-	unsigned int required_lines;
-	unsigned int free_lines = 0;
-	unsigned int cache_line[3];
-	unsigned int count = 0;
-	unsigned int i, j;
-	int r;
+	int i, kcache_banks = bc->chip_class >= EVERGREEN ? 4 : 2;
 
-	/* Collect required cache lines. */
-	for (i = 0; i < 3; ++i) {
-		boolean found = false;
-		unsigned int line;
+	for (i = 0; i < kcache_banks; i++) {
+		if (kcache[i].mode) {
+			int d;
 
-		if (alu->src[i].sel < 512)
-			continue;
+			if (kcache[i].bank < bank)
+				continue;
 
-		line = ((alu->src[i].sel - 512) / 32) * 2;
+			if ((kcache[i].bank == bank && kcache[i].addr > line+1) ||
+					kcache[i].bank > bank) {
+				/* try to insert new line */
+				if (kcache[kcache_banks-1].mode) {
+					/* all sets are in use */
+					return -ENOMEM;
+				}
 
-		for (j = 0; j < count; ++j) {
-			if (cache_line[j] == line) {
-				found = true;
-				break;
+				memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache));
+				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
+				kcache[i].bank = bank;
+				kcache[i].addr = line;
+				return 0;
 			}
-		}
 
-		if (!found)
-			cache_line[count++] = line;
-	}
+			d = line - kcache[i].addr;
 
-	/* This should never actually happen. */
-	if (count >= 3) return -ENOMEM;
-
-	for (i = 0; i < 2; ++i) {
-		if (kcache[i].mode == V_SQ_CF_KCACHE_NOP) {
-			++free_lines;
-		}
-	}
-
-	/* Filter lines pulled in by previous intructions. Note that this is
-	 * only for the required_lines count, we can't remove these from the
-	 * cache_line array since we may have to start a new ALU clause. */
-	for (i = 0, required_lines = count; i < count; ++i) {
-		for (j = 0; j < 2; ++j) {
-			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
-			    kcache[j].addr == cache_line[i]) {
-				--required_lines;
-				break;
-			}
+			if (d == -1) {
+				kcache[i].addr--;
+				if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) {
+					/* we are prepending the line to the current set,
+					 * discarding the existing second line,
+					 * so we'll have to insert line+2 after it */
+					line += 2;
+					continue;
+				} else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) {
+					kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
+					return 0;
+				} else {
+					/* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */
+					return -ENOMEM;
+				}
+			} else if (d == 1) {
+				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
+				return 0;
+			} else if (d == 0)
+				return 0;
+		} else { /* free kcache set - use it */
+			kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
+			kcache[i].bank = bank;
+			kcache[i].addr = line;
+			return 0;
 		}
 	}
+	return -ENOMEM;
+}
 
-	/* Start a new ALU clause if needed. */
-	if (required_lines > free_lines) {
-		if ((r = r600_bytecode_add_cf(bc))) {
-			return r;
-		}
-		bc->cf_last->inst = type;
-		kcache = bc->cf_last->kcache;
-	}
+static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc,
+		struct r600_bytecode_kcache *kcache,
+		struct r600_bytecode_alu *alu)
+{
+	int i, r;
 
-	/* Setup the kcache lines. */
-	for (i = 0; i < count; ++i) {
-		boolean found = false;
+	for (i = 0; i < 3; i++) {
+		unsigned bank, line, sel = alu->src[i].sel;
 
-		for (j = 0; j < 2; ++j) {
-			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
-			    kcache[j].addr == cache_line[i]) {
-				found = true;
-				break;
-			}
-		}
+		if (sel < 512)
+			continue;
 
-		if (found) continue;
+		bank = alu->src[i].kc_bank;
+		line = (sel-512)>>4;
 
-		for (j = 0; j < 2; ++j) {
-			if (kcache[j].mode == V_SQ_CF_KCACHE_NOP) {
-				kcache[j].bank = 0;
-				kcache[j].addr = cache_line[i];
-				kcache[j].mode = V_SQ_CF_KCACHE_LOCK_2;
-				break;
-			}
-		}
+		if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line)))
+			return r;
 	}
+	return 0;
+}
+
+static int r600_bytecode_assign_kcache_banks(struct r600_bytecode *bc,
+		struct r600_bytecode_alu *alu,
+		struct r600_bytecode_kcache * kcache)
+{
+	int i, j;
 
 	/* Alter the src operands to refer to the kcache. */
 	for (i = 0; i < 3; ++i) {
 		static const unsigned int base[] = {128, 160, 256, 288};
-		unsigned int line;
+		unsigned int line, sel = alu->src[i].sel, found = 0;
 
-		if (alu->src[i].sel < 512)
+		if (sel < 512)
 			continue;
 
-		alu->src[i].sel -= 512;
-		line = (alu->src[i].sel / 32) * 2;
+		sel -= 512;
+		line = sel>>4;
 
-		for (j = 0; j < 2; ++j) {
-			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
-			    kcache[j].addr == line) {
-				alu->src[i].sel &= 0x1f;
-				alu->src[i].sel += base[j];
-				break;
+		for (j = 0; j < 4 && !found; ++j) {
+			switch (kcache[j].mode) {
+			case V_SQ_CF_KCACHE_NOP:
+			case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX:
+				R600_ERR("unexpected kcache line mode\n");
+				return -ENOMEM;
+			default:
+				if (kcache[j].bank == alu->src[i].kc_bank &&
+						kcache[j].addr <= line &&
+						line < kcache[j].addr + kcache[j].mode) {
+					alu->src[i].sel = sel - (kcache[j].addr<<4);
+					alu->src[i].sel += base[j];
+					found=1;
+			    }
 			}
 		}
 	}
+	return 0;
+}
+
+static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, int type)
+{
+	struct r600_bytecode_kcache kcache_sets[4];
+	struct r600_bytecode_kcache *kcache = kcache_sets;
+	int r;
+
+	memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache));
+
+	if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
+		/* can't alloc, need to start new clause */
+		if ((r = r600_bytecode_add_cf(bc))) {
+			return r;
+		}
+		bc->cf_last->inst = type;
+
+		/* retry with the new clause */
+		kcache = bc->cf_last->kcache;
+		if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
+			/* can't alloc again- should never happen */
+			return r;
+		}
+	} else {
+		/* update kcache sets */
+		memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache));
+	}
+
+	/* if we actually used more than 2 kcache sets - use ALU_EXTENDED on eg+ */
+	if (kcache[2].mode != V_SQ_CF_KCACHE_NOP) {
+		if (bc->chip_class < EVERGREEN)
+			return -ENOMEM;
+		bc->cf_last->eg_alu_extended = 1;
+	}
 
 	return 0;
 }
@@ -1848,6 +1895,8 @@ int r600_bytecode_build(struct r600_bytecode *bc)
 					if (r)
 						return r;
 					r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
+					r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache);
+
 					switch(bc->chip_class) {
 					case EVERGREEN: /* eg alu is same encoding as r700 */
 					case CAYMAN:
@@ -1943,6 +1992,8 @@ int r600_bytecode_build(struct r600_bytecode *bc)
 					if (r)
 						return r;
 					r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
+					r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache);
+
 					switch(bc->chip_class) {
 					case R600:
 						r = r600_bytecode_alu_build(bc, alu, addr);
@@ -2083,6 +2134,19 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER:
 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER:
 			case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
+				if (cf->eg_alu_extended) {
+					fprintf(stderr, "%04d %08X ALU_EXT0 ", id, bc->bytecode[id]);
+					fprintf(stderr, "KCACHE_BANK2:%X ", cf->kcache[2].bank);
+					fprintf(stderr, "KCACHE_BANK3:%X ", cf->kcache[3].bank);
+					fprintf(stderr, "KCACHE_MODE2:%X\n", cf->kcache[2].mode);
+					id++;
+					fprintf(stderr, "%04d %08X ALU_EXT1 ", id, bc->bytecode[id]);
+					fprintf(stderr, "KCACHE_MODE3:%X ", cf->kcache[3].mode);
+					fprintf(stderr, "KCACHE_ADDR2:%X ", cf->kcache[2].addr);
+					fprintf(stderr, "KCACHE_ADDR3:%X\n", cf->kcache[3].addr);
+					id++;
+				}
+
 				fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
 				fprintf(stderr, "ADDR:%d ", cf->addr);
 				fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index d0ff75d..a5b3425 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -32,6 +32,7 @@ struct r600_bytecode_alu_src {
 	unsigned			neg;
 	unsigned			abs;
 	unsigned			rel;
+	unsigned			kc_bank;
 	uint32_t			value;
 };
 
@@ -143,8 +144,9 @@ struct r600_bytecode_cf {
 	unsigned			cond;
 	unsigned			pop_count;
 	unsigned			cf_addr; /* control flow addr */
-	struct r600_bytecode_kcache		kcache[2];
+	struct r600_bytecode_kcache		kcache[4];
 	unsigned			r6xx_uses_waterfall;
+	unsigned			eg_alu_extended;
 	struct list_head		alu;
 	struct list_head		tex;
 	struct list_head		vtx;
-- 
1.7.7.5



More information about the mesa-dev mailing list