Mesa (master): nvfx: simplify and correct fragment program update logic

Luca Barbieri lb at kemper.freedesktop.org
Sun Aug 22 13:42:32 UTC 2010


Module: Mesa
Branch: master
Commit: 7de1f86c49716eeadb443507d16ead933288059c
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=7de1f86c49716eeadb443507d16ead933288059c

Author: Luca Barbieri <luca at luca-barbieri.com>
Date:   Sun Aug 22 12:02:41 2010 +0200

nvfx: simplify and correct fragment program update logic

This version should hopefully be much clearer and thus less likely
to be subtly broken.

Also fixes point sprites on nv40 and possibly some other bugs too.

---

 src/gallium/drivers/nvfx/nvfx_context.c    |    2 +
 src/gallium/drivers/nvfx/nvfx_context.h    |    2 +
 src/gallium/drivers/nvfx/nvfx_fragprog.c   |  185 +++++++++++++++++++---------
 src/gallium/drivers/nvfx/nvfx_state.h      |    3 +-
 src/gallium/drivers/nvfx/nvfx_state_emit.c |    2 +
 src/gallium/drivers/nvfx/nvfx_vertprog.c   |   49 +++-----
 6 files changed, 149 insertions(+), 94 deletions(-)

diff --git a/src/gallium/drivers/nvfx/nvfx_context.c b/src/gallium/drivers/nvfx/nvfx_context.c
index 8e85201..e78fc14 100644
--- a/src/gallium/drivers/nvfx/nvfx_context.c
+++ b/src/gallium/drivers/nvfx/nvfx_context.c
@@ -90,6 +90,8 @@ nvfx_create(struct pipe_screen *pscreen, void *priv)
 
 	/* set these to that we init them on first validation */
 	nvfx->state.scissor_enabled = ~0;
+	nvfx->hw_pointsprite_control = -1;
+	nvfx->hw_vp_output = -1;
 	nvfx->use_vertex_buffers = -1;
 
 	LIST_INITHEAD(&nvfx->render_cache);
diff --git a/src/gallium/drivers/nvfx/nvfx_context.h b/src/gallium/drivers/nvfx/nvfx_context.h
index 63fbce8..fb4a9da 100644
--- a/src/gallium/drivers/nvfx/nvfx_context.h
+++ b/src/gallium/drivers/nvfx/nvfx_context.h
@@ -193,6 +193,8 @@ struct nvfx_context {
 	uint32_t hw_txf[8];
 	struct nvfx_render_target hw_rt[4];
 	struct nvfx_render_target hw_zeta;
+	int hw_pointsprite_control;
+	int hw_vp_output;
 };
 
 static INLINE struct nvfx_context *
diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
index 025989a..e40a814 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -1101,6 +1101,35 @@ nvfx_fp_memcpy(void* dst, const void* src, size_t len)
 #endif
 }
 
+/* The hardware only supports immediate constants inside the fragment program,
+ * and at least on nv30 doesn't support an indirect linkage table.
+ *
+ * Hence, we need to patch the fragment program itself both to update constants
+ * and update linkage.
+ *
+ * Using a single fragment program would entail unacceptable stalls if the GPU is
+ * already rendering with that fragment program.
+ * Thus, we instead use a "rotating queue" of buffer objects, each of which is
+ * packed with multiple versions of the same program.
+ *
+ * Whenever we need to patch something, we move to the next program and
+ * patch it. If all buffer objects are in use by the GPU, we allocate another one,
+ * expanding the queue.
+ *
+ * As an additional optimization, we record when all the programs have the
+ * current input slot configuration, and at that point we stop patching inputs.
+ * This happens, for instance, if a given fragment program is always used with
+ * the same vertex program (i.e. always with GLSL), or if the layouts match
+ * enough (non-GLSL).
+ *
+ * Note that instead of using multiple programs, we could push commands
+ * on the FIFO to patch a single program: it's not fully clear which option is
+ * faster, but my guess is that the current way is faster.
+ *
+ * We also track the previous slot assignments for each version and don't
+ * patch if they are the same (this could perhaps be removed).
+ */
+
 void
 nvfx_fragprog_validate(struct nvfx_context *nvfx)
 {
@@ -1109,6 +1138,7 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 	int update = 0;
 	struct nvfx_vertex_program* vp;
 	unsigned sprite_coord_enable;
+	boolean update_pointsprite = !!(nvfx->dirty & NVFX_NEW_FRAGPROG);
 
 	if (!fp->translated)
 	{
@@ -1141,80 +1171,96 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 		fp->bo_prog_idx = fp->progs_per_bo - 1;
 	}
 
-	/* we must update constants even on "just" fragprog changes, because
-	   we don't check whether the current constant buffer matches the latest
-	   one bound to this fragment program */
-	if (nvfx->dirty & (NVFX_NEW_FRAGCONST | NVFX_NEW_FRAGPROG))
-		update = TRUE;
-
 	vp = nvfx->render_mode == HW ? nvfx->vertprog : nvfx->swtnl.vertprog;
-	if (fp->last_vp_id != vp->id) {
-		char* vp_sem_table = vp->generic_to_fp_input;
-		unsigned char* fp_semantics = fp->slot_to_generic;
-		unsigned diff = 0;
-		unsigned char* cur_slots;
-		fp->last_vp_id = nvfx->vertprog->id;
-		cur_slots = fp->slot_to_fp_input;
-		for(unsigned i = 0; i < fp->num_slots; ++i) {
-			unsigned char slot_mask = vp_sem_table[fp_semantics[i]];
-			diff |= (slot_mask >> 4) & (slot_mask ^ cur_slots[i]);
-		}
+        sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * nvfx->rasterizer->pipe.sprite_coord_enable;
+
+	if (fp->last_vp_id != vp->id || fp->last_sprite_coord_enable != sprite_coord_enable) {
+		int sprite_input = -1;
+		unsigned i;
+		fp->last_vp_id = vp->id;
+		fp->last_sprite_coord_enable = sprite_coord_enable;
 
-		if(diff)
+		if(sprite_coord_enable)
 		{
-			for(unsigned i = 0; i < fp->num_slots; ++i) {
-				/* if 0xff, then this will write to the dummy value at fp->last_layout_mask[0] */
-				fp->slot_to_fp_input[i] = vp_sem_table[fp_semantics[i]] & 0xf;
-				//printf("fp: GENERIC[%i] from fpreg %i\n", fp_semantics[i], fp->slot_to_fp_input[i]);
+			sprite_input = vp->sprite_fp_input;
+			if(sprite_input < 0)
+			{
+				unsigned used_texcoords = 0;
+				for(unsigned i = 0; i < fp->num_slots; ++i) {
+					unsigned generic = fp->slot_to_generic[i];
+					if(!((1 << generic) & sprite_coord_enable))
+					{
+						unsigned char slot_mask = vp->generic_to_fp_input[generic];
+						if(slot_mask >= 0xf0)
+							used_texcoords |= 1 << ((slot_mask & 0xf) - NVFX_FP_OP_INPUT_SRC_TC0);
+					}
+				}
+
+				sprite_input = NVFX_FP_OP_INPUT_SRC_TC(__builtin_ctz(~used_texcoords));
 			}
 
-			fp->progs_left_with_obsolete_slot_assignments = fp->progs;
-			update = TRUE;
+			fp->point_sprite_control |= (1 << (sprite_input - NVFX_FP_OP_INPUT_SRC_TC0 + 8));
 		}
-	}
+		else
+			fp->point_sprite_control = 0;
 
-	// last_sprite_coord_enable
-	sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * nvfx->rasterizer->pipe.sprite_coord_enable;
-	if(fp->last_sprite_coord_enable != sprite_coord_enable)
-	{
-		unsigned texcoord_mask = vp->texcoord_ouput_mask;
-		fp->last_sprite_coord_enable = sprite_coord_enable;
-		fp->point_sprite_control = 0;
-		for(unsigned i = 0; i < fp->num_slots; ++i) {
-			if((1 << fp->slot_to_generic[i]) & sprite_coord_enable)
+		for(i = 0; i < fp->num_slots; ++i) {
+			unsigned generic = fp->slot_to_generic[i];
+			if((1 << generic) & sprite_coord_enable)
 			{
-				unsigned fpin = fp->slot_to_fp_input[i];
-				//printf("sprite: slot %i generic %i had texcoord %i\n", i, fp->slot_to_generic[i], fpin - NVFX_FP_OP_INPUT_SRC_TC0);
-				if(fpin >= 0x0f)
-				{
-					unsigned tc = __builtin_ctz(~texcoord_mask);
-					texcoord_mask |= (1 << tc);
-					fp->slot_to_fp_input[i] = fpin = NVFX_FP_OP_INPUT_SRC_TC(tc);
-
-					fp->progs_left_with_obsolete_slot_assignments = fp->progs;
-					update = TRUE;
-				}
-				//printf("sprite: slot %i texcoord %i\n", i, fpin - NVFX_FP_OP_INPUT_SRC_TC0);
-				fp->point_sprite_control |= (1 << (fpin - NVFX_FP_OP_INPUT_SRC_TC0 + 8));
+				if(fp->slot_to_fp_input[i] != sprite_input)
+					goto update_slots;
 			}
 			else
 			{
-				unsigned fpin = fp->slot_to_fp_input[i];
-				if(!(vp->texcoord_ouput_mask & (1 << (fpin - NVFX_FP_OP_INPUT_SRC_TC0))))
-				{
-					fp->slot_to_fp_input[i] = 0x0f;
+				unsigned char slot_mask = vp->generic_to_fp_input[generic];
+				if((slot_mask >> 4) & (slot_mask ^ fp->slot_to_fp_input[i]))
+					goto update_slots;
+			}
+		}
+
+		if(0)
+		{
+update_slots:
+			/* optimization: we start updating from the slot we found the first difference in */
+			for(; i < fp->num_slots; ++i)
+			{
+				unsigned generic = fp->slot_to_generic[i];
+				if((1 << generic) & sprite_coord_enable)
+					fp->slot_to_fp_input[i] = sprite_input;
+				else
+					fp->slot_to_fp_input[i] = vp->generic_to_fp_input[generic] & 0xf;
+			}
 
-					fp->progs_left_with_obsolete_slot_assignments = fp->progs;
-					update = TRUE;
+			if(nvfx->is_nv4x)
+			{
+				fp->or = 0;
+				for(i = 0; i < fp->num_slots; ++i) {
+					unsigned fp_input = fp->slot_to_fp_input[i];
+					if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(8))
+						fp->or |= (1 << 12);
+					else if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(9))
+						fp->or |= (1 << 13);
+					else if(fp_input != 0xf)
+						fp->or |= (1 << (fp_input - NVFX_FP_OP_INPUT_SRC_TC0 + 14));
 				}
 			}
+
+			fp->progs_left_with_obsolete_slot_assignments = fp->progs;
+			goto update;
 		}
 	}
 
-	if(update) {
+	/* We must update constants even on "just" fragprog changes, because
+	  * we don't check whether the current constant buffer matches the latest
+	  * one bound to this fragment program.
+	  * Doing such a check would likely be a pessimization.
+	  */
+	if (nvfx->dirty & (NVFX_NEW_FRAGCONST | NVFX_NEW_FRAGPROG)) {
 		int offset;
 		uint32_t* fpmap;
 
+update:
 		++fp->bo_prog_idx;
 		if(fp->bo_prog_idx >= fp->progs_per_bo)
 		{
@@ -1278,6 +1324,9 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 			}
 		}
 
+		/* we only do this if we aren't sure that all program versions have the
+		 * current slot assignments, otherwise we just update constants for speed
+		 */
 		if(fp->progs_left_with_obsolete_slot_assignments) {
 			unsigned char* fpbo_slots = &fp->fpbo->slots[fp->bo_prog_idx * 8];
 			for(unsigned i = 0; i < fp->num_slots; ++i) {
@@ -1296,10 +1345,7 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 			}
 			--fp->progs_left_with_obsolete_slot_assignments;
 		}
-	}
 
-	if(update || (nvfx->dirty & NVFX_NEW_FRAGPROG)) {
-		int offset = fp->bo_prog_idx * fp->prog_size;
 		MARK_RING(chan, 8, 1);
 		OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1));
 		OUT_RELOC(chan, fp->fpbo->bo, offset, NOUVEAU_BO_VRAM |
@@ -1316,11 +1362,28 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 		}
 	}
 
-	if(nvfx->dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_SPRITE))
 	{
-		WAIT_RING(chan, 2);
-		OUT_RING(chan, RING_3D(NV34TCL_POINT_SPRITE, 1));
-		OUT_RING(chan, fp->point_sprite_control | nvfx->rasterizer->pipe.point_quad_rasterization);
+		unsigned pointsprite_control = fp->point_sprite_control | nvfx->rasterizer->pipe.point_quad_rasterization;
+		if(pointsprite_control != nvfx->hw_pointsprite_control)
+		{
+			WAIT_RING(chan, 2);
+			OUT_RING(chan, RING_3D(NV34TCL_POINT_SPRITE, 1));
+			OUT_RING(chan, pointsprite_control);
+			nvfx->hw_pointsprite_control = pointsprite_control;
+		}
+	}
+
+	if(nvfx->is_nv4x)
+	{
+		unsigned vp_output = vp->or | fp->or;
+
+		if(vp_output != nvfx->hw_vp_output)
+		{
+			WAIT_RING(chan, 2);
+			OUT_RING(chan, RING_3D(NV40TCL_VP_RESULT_EN, 1));
+			OUT_RING(chan, vp_output);
+			nvfx->hw_vp_output = vp_output;
+		}
 	}
 }
 
diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
index 1247abc..05d41cf 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.h
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -32,7 +32,7 @@ struct nvfx_vertex_program {
 	unsigned nr_consts;
 
 	char generic_to_fp_input[256];
-	unsigned texcoord_ouput_mask;
+	int sprite_fp_input;
 
 	struct nouveau_resource *exec;
 	unsigned exec_start;
@@ -67,6 +67,7 @@ struct nvfx_fragment_program {
 	boolean translated;
 	unsigned samplers;
 	unsigned point_sprite_control;
+	unsigned or;
 
 	uint32_t *insn;
 	int       insn_len;
diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c
index 8e3c342..bd89a38 100644
--- a/src/gallium/drivers/nvfx/nvfx_state_emit.c
+++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c
@@ -17,6 +17,8 @@ nvfx_state_validate_common(struct nvfx_context *nvfx)
 	{
 		nvfx->dirty = ~0;
 		nvfx->hw_vtxelt_nr = 16;
+		nvfx->hw_pointsprite_control = -1;
+		nvfx->hw_vp_output = -1;
 		nvfx->screen->cur_ctx = nvfx;
 	}
 
diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c
index 3c3521e..806f263 100644
--- a/src/gallium/drivers/nvfx/nvfx_vertprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c
@@ -235,24 +235,10 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot
 			dst.index = NVFX_VP(INST_DEST_PSZ);
 			break;
 		default:
-			if(!nvfx->is_nv4x) {
-				switch (dst.index) {
-				case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
-				case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
-				case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
-				case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
-				case NV30_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
-				case NV30_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
-				case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
-				case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
-				case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
-				case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
-				case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
-				case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
-				case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
-				case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
-				}
-			} else {
+			if(nvfx->is_nv4x) {
+				/* we don't need vp->or on nv3x
+				 * texcoords are handled by fragment program
+				 */
 				switch (dst.index) {
 				case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
 				case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
@@ -260,14 +246,6 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot
 				case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
 				case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
 				case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
-				case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
-				case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
-				case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
-				case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
-				case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
-				case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
-				case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
-				case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
 				}
 			}
 			break;
@@ -817,13 +795,21 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
 
 	/* hope 0xf is (0, 0, 0, 1) initialized; otherwise, we are _probably_ not required to do this */
 	memset(vpc->vp->generic_to_fp_input, 0x0f, sizeof(vpc->vp->generic_to_fp_input));
-	vpc->vp->texcoord_ouput_mask = 0;
 	for(int i = 0; i < 8; ++i) {
 		if(sem_layout[i] == 0xff)
 			continue;
-		vpc->vp->texcoord_ouput_mask |= (1 << i);
 		//printf("vp: GENERIC[%i] to fpreg %i\n", sem_layout[i], NVFX_FP_OP_INPUT_SRC_TC(0) + i);
-		vpc->vp->generic_to_fp_input[sem_layout[i]] = 0xf0 | (NVFX_FP_OP_INPUT_SRC_TC(0) + i);
+		vpc->vp->generic_to_fp_input[sem_layout[i]] = 0xf0 | NVFX_FP_OP_INPUT_SRC_TC(i);
+	}
+
+	vpc->vp->sprite_fp_input = -1;
+	for(int i = 0; i < 8; ++i)
+	{
+		if(sem_layout[i] == 0xff)
+		{
+			vpc->vp->sprite_fp_input = NVFX_FP_OP_INPUT_SRC_TC(i);
+			break;
+		}
 	}
 
 	tgsi_parse_init(&p, vpc->vp->pipe.tokens);
@@ -1233,13 +1219,12 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx)
 
 	if(nvfx->dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP))
 	{
-		WAIT_RING(chan, 7);
+		WAIT_RING(chan, 6);
 		OUT_RING(chan, RING_3D(NV34TCL_VP_START_FROM_ID, 1));
 		OUT_RING(chan, vp->exec->start);
 		if(nvfx->is_nv4x) {
-			OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 2));
+			OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 1));
 			OUT_RING(chan, vp->ir);
-			OUT_RING(chan, vp->or);
 		}
 		OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1));
 		OUT_RING(chan, vp->clip_ctrl);




More information about the mesa-commit mailing list