Mesa (master): r600g: build fetch shader from vertex elements

Jerome Glisse glisse at kemper.freedesktop.org
Mon Dec 6 20:54:28 UTC 2010


Module: Mesa
Branch: master
Commit: fa86fc564aea4e40c89f6fc889e6a5bf817634b3
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=fa86fc564aea4e40c89f6fc889e6a5bf817634b3

Author: Jerome Glisse <jglisse at redhat.com>
Date:   Fri Dec  3 20:47:02 2010 -0500

r600g: build fetch shader from vertex elements

Vertex element changes are less frequent than draw calls, so to
avoid rebuilding the fetch shader too often, build the fetch shader along
with the vertex elements. This also allows moving vertex buffer setup out
of the draw path and makes updates to it less frequent.

Shader update can still be improved to only update SPI regs (based
on some rasterizer state like flat shading or point sprite ...).

Signed-off-by: Jerome Glisse <jglisse at redhat.com>

---

 src/gallium/drivers/r600/eg_asm.c            |   35 +++
 src/gallium/drivers/r600/evergreen_state.c   |   97 ++++++++-
 src/gallium/drivers/r600/r600_asm.c          |  317 ++++++++++++++++++++++++++
 src/gallium/drivers/r600/r600_asm.h          |    8 +
 src/gallium/drivers/r600/r600_buffer.c       |    5 +-
 src/gallium/drivers/r600/r600_pipe.c         |    2 +
 src/gallium/drivers/r600/r600_pipe.h         |   20 ++-
 src/gallium/drivers/r600/r600_shader.c       |   18 +-
 src/gallium/drivers/r600/r600_state.c        |   83 +++++++
 src/gallium/drivers/r600/r600_state_common.c |   35 +++-
 src/gallium/drivers/r600/r600_translate.c    |   43 ++--
 11 files changed, 619 insertions(+), 44 deletions(-)

diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c
index 21d66fa..b79875c 100644
--- a/src/gallium/drivers/r600/eg_asm.c
+++ b/src/gallium/drivers/r600/eg_asm.c
@@ -27,6 +27,7 @@
 #include "r600_asm.h"
 #include "eg_sq.h"
 #include "r600_opcodes.h"
+#include "evergreend.h"
 
 int eg_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
 {
@@ -89,3 +90,37 @@ int eg_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
 	}
 	return 0;
 }
+
+/* Emit the evergreen control-flow words of a fetch shader (one VTX clause
+ * for up to 8 fetches, two beyond that, then RETURN) and fill ve->rstate. */
+void eg_cf_vtx(struct r600_vertex_element *ve, u32 *bytecode, unsigned count)
+{
+	struct r600_pipe_state *state = &ve->rstate;
+	/* the first clause holds at most 8 fetch instructions */
+	unsigned first = count > 8 ? 8 : count;
+	u32 *cf = bytecode;
+
+	/* the caller places the fetch instructions from dword 8 on */
+	*cf++ = S_SQ_CF_WORD0_ADDR(8 >> 1);
+	*cf++ = S_SQ_CF_WORD1_CF_INST(EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
+		S_SQ_CF_WORD1_BARRIER(1) |
+		S_SQ_CF_WORD1_COUNT(first - 1);
+	if (count > 8) {
+		/* remaining fetches: dword 40 = 8 + 8 fetches * 4 dwords */
+		*cf++ = S_SQ_CF_WORD0_ADDR(40 >> 1);
+		*cf++ = S_SQ_CF_WORD1_CF_INST(EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
+			S_SQ_CF_WORD1_BARRIER(1) |
+			S_SQ_CF_WORD1_COUNT(count - 8 - 1);
+	}
+	*cf++ = S_SQ_CF_WORD0_ADDR(0);
+	*cf++ = S_SQ_CF_WORD1_CF_INST(EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN) |
+		S_SQ_CF_WORD1_BARRIER(1);
+
+	state->id = R600_PIPE_STATE_FETCH_SHADER;
+	state->nregs = 0;
+	r600_pipe_state_add_reg(state, R_0288A8_SQ_PGM_RESOURCES_FS,
+				0x00000000, 0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(state, R_0288A4_SQ_PGM_START_FS,
+				r600_bo_offset(ve->fetch_shader) >> 8,
+				0xFFFFFFFF, ve->fetch_shader);
+}
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index ebd541d..b313d52 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1259,6 +1259,90 @@ void evergreen_polygon_offset_update(struct r600_pipe_context *rctx)
 	}
 }
 
+/* Rebuild the vertex buffer resources in rctx->vs_resource[] from the bound
+ * vertex elements and vertex buffers, translating or uploading user buffers
+ * first when needed. Does nothing until both are known. */
+void evergreen_vertex_buffer_update(struct r600_pipe_context *rctx)
+{
+	struct r600_pipe_state *rstate;
+	struct r600_resource *rbuffer;
+	struct pipe_vertex_buffer *vertex_buffer;
+	unsigned i, offset;
+
+	/* we don't update until we know vertex elements */
+	if (rctx->vertex_elements == NULL || !rctx->nvertex_buffer)
+		return;
+
+	/* delete previous translated vertex elements */
+	if (rctx->tran.new_velems) {
+		r600_end_vertex_translate(rctx);
+		/* that unbinds the translated elements if they were current */
+		if (rctx->vertex_elements == NULL)
+			return;
+	}
+
+	if (rctx->vertex_elements->incompatible_layout) {
+		/* translate rebind new vertex elements so
+		 * return once translated
+		 */
+		r600_begin_vertex_translate(rctx);
+		return;
+	}
+
+	if (rctx->any_user_vbs) {
+		r600_upload_user_buffers(rctx);
+		rctx->any_user_vbs = FALSE;
+	}
+
+	if (rctx->vertex_elements->vbuffer_need_offset) {
+		rctx->nvs_resource = rctx->vertex_elements->count;
+	} else {
+		rctx->nvs_resource = rctx->nvertex_buffer;
+	}
+
+	for (i = 0 ; i < rctx->nvs_resource; i++) {
+		rstate = &rctx->vs_resource[i];
+		rstate->id = R600_PIPE_STATE_RESOURCE;
+		rstate->nregs = 0;
+
+		if (rctx->vertex_elements->vbuffer_need_offset) {
+			/* one resource per vertex elements */
+			unsigned vbuffer_index = rctx->vertex_elements->elements[i].vertex_buffer_index;
+			vertex_buffer = &rctx->vertex_buffer[vbuffer_index];
+			offset = rctx->vertex_elements->vbuffer_offset[i];
+		} else {
+			/* bind vertex buffer once */
+			vertex_buffer = &rctx->vertex_buffer[i];
+			offset = 0;
+		}
+		rbuffer = (struct r600_resource*)vertex_buffer->buffer;
+		offset += vertex_buffer->buffer_offset + r600_bo_offset(rbuffer->bo);
+
+		r600_pipe_state_add_reg(rstate, R_030000_RESOURCE0_WORD0,
+					offset, 0xFFFFFFFF, rbuffer->bo);
+		r600_pipe_state_add_reg(rstate, R_030004_RESOURCE0_WORD1,
+					rbuffer->size - offset - 1, 0xFFFFFFFF, NULL);
+		r600_pipe_state_add_reg(rstate, R_030008_RESOURCE0_WORD2,
+					S_030008_STRIDE(vertex_buffer->stride),
+					0xFFFFFFFF, NULL);
+		r600_pipe_state_add_reg(rstate, R_03000C_RESOURCE0_WORD3,
+					S_03000C_DST_SEL_X(V_03000C_SQ_SEL_X) |
+					S_03000C_DST_SEL_Y(V_03000C_SQ_SEL_Y) |
+					S_03000C_DST_SEL_Z(V_03000C_SQ_SEL_Z) |
+					S_03000C_DST_SEL_W(V_03000C_SQ_SEL_W),
+					0xFFFFFFFF, NULL);
+		r600_pipe_state_add_reg(rstate, R_030010_RESOURCE0_WORD4,
+					0x00000000, 0xFFFFFFFF, NULL);
+		r600_pipe_state_add_reg(rstate, R_030014_RESOURCE0_WORD5,
+					0x00000000, 0xFFFFFFFF, NULL);
+		r600_pipe_state_add_reg(rstate, R_030018_RESOURCE0_WORD6,
+					0x00000000, 0xFFFFFFFF, NULL);
+		r600_pipe_state_add_reg(rstate, R_03001C_RESOURCE0_WORD7,
+					0xC0000000, 0xFFFFFFFF, NULL);
+		evergreen_fs_resource_set(&rctx->ctx, rstate, i);
+	}
+}
+
 int r600_conv_pipe_prim(unsigned pprim, unsigned *prim);
 void evergreen_draw(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
@@ -1273,6 +1357,7 @@ void evergreen_draw(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	struct r600_drawl draw;
 	boolean translate = FALSE;
 
+#if 0
 	if (rctx->vertex_elements->incompatible_layout) {
 		r600_begin_vertex_translate(rctx);
 		translate = TRUE;
@@ -1282,6 +1367,7 @@ void evergreen_draw(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		r600_upload_user_buffers(rctx);
 		rctx->any_user_vbs = FALSE;
 	}
+#endif
 
 	memset(&draw, 0, sizeof(struct r600_drawl));
 	draw.ctx = ctx;
@@ -1338,6 +1424,7 @@ void evergreen_draw(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	if (r600_pipe_shader_update(&rctx->context, rctx->ps_shader))
 		return;
 
+#if 0
 	for (i = 0 ; i < rctx->vertex_elements->count; i++) {
 		uint32_t word3, word2;
 		uint32_t format;
@@ -1372,6 +1459,7 @@ void evergreen_draw(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		r600_pipe_state_add_reg(rstate, R_03001C_RESOURCE0_WORD7, 0xC0000000, 0xFFFFFFFF, NULL);
 		evergreen_fs_resource_set(&rctx->ctx, rstate, i);
 	}
+#endif
 
 	mask = 0;
 	for (int i = 0; i < rctx->framebuffer.nr_cbufs; i++) {
@@ -1588,14 +1676,17 @@ void evergreen_pipe_shader_vs(struct pipe_context *ctx, struct r600_pipe_shader
 				R_028864_SQ_PGM_RESOURCES_2_VS,
 				0x0, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate,
-			R_0288A8_SQ_PGM_RESOURCES_FS,
-			0x00000000, 0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate,
 			R_02885C_SQ_PGM_START_VS,
 			(r600_bo_offset(shader->bo)) >> 8, 0xFFFFFFFF, shader->bo);
+
+#if 0
+	r600_pipe_state_add_reg(rstate,
+			R_0288A8_SQ_PGM_RESOURCES_FS,
+			0x00000000, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate,
 			R_0288A4_SQ_PGM_START_FS,
 			(r600_bo_offset(shader->bo)) >> 8, 0xFFFFFFFF, shader->bo_fetch);
+#endif
 
 	r600_pipe_state_add_reg(rstate,
 				R_03A200_SQ_LOOP_CONST_0 + (32 * 4), 0x01000FFF,
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 73daa00..e13c606 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -22,12 +22,15 @@
  */
 #include <stdio.h>
 #include <errno.h>
+#include "util/u_format.h"
 #include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "r600_pipe.h"
 #include "r600_sq.h"
 #include "r600_opcodes.h"
 #include "r600_asm.h"
+#include "r600_formats.h"
+#include "r600d.h"
 
 static inline unsigned int r600_bc_get_num_operands(struct r600_bc_alu *alu)
 {
@@ -972,3 +975,317 @@ void r600_bc_dump(struct r600_bc *bc)
 	}
 	fprintf(stderr, "--------------------------------------\n");
 }
+
+/* Emit the r600 control-flow words of a fetch shader (one VTX clause for
+ * up to 8 fetches, two beyond that, then RETURN) and fill ve->rstate. */
+void r600_cf_vtx(struct r600_vertex_element *ve, u32 *bytecode, unsigned count)
+{
+	struct r600_pipe_state *state = &ve->rstate;
+	/* the first clause holds at most 8 fetch instructions */
+	unsigned first = count > 8 ? 8 : count;
+	u32 *cf = bytecode;
+
+	/* the caller places the fetch instructions from dword 8 on */
+	*cf++ = S_SQ_CF_WORD0_ADDR(8 >> 1);
+	*cf++ = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
+		S_SQ_CF_WORD1_BARRIER(1) |
+		S_SQ_CF_WORD1_COUNT(first - 1);
+	if (count > 8) {
+		/* remaining fetches: dword 40 = 8 + 8 fetches * 4 dwords */
+		*cf++ = S_SQ_CF_WORD0_ADDR(40 >> 1);
+		*cf++ = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
+			S_SQ_CF_WORD1_BARRIER(1) |
+			S_SQ_CF_WORD1_COUNT(count - 8 - 1);
+	}
+	*cf++ = S_SQ_CF_WORD0_ADDR(0);
+	*cf++ = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN) |
+		S_SQ_CF_WORD1_BARRIER(1);
+
+	state->id = R600_PIPE_STATE_FETCH_SHADER;
+	state->nregs = 0;
+	r600_pipe_state_add_reg(state, R_0288A4_SQ_PGM_RESOURCES_FS,
+				0x00000000, 0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(state, R_0288DC_SQ_PGM_CF_OFFSET_FS,
+				0x00000000, 0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(state, R_028894_SQ_PGM_START_FS,
+				r600_bo_offset(ve->fetch_shader) >> 8,
+				0xFFFFFFFF, ve->fetch_shader);
+}
+
+/* Same layout as r600_cf_vtx() but the clauses use the VTX_TC control-flow
+ * instruction: up to two fetch clauses, then RETURN, then ve->rstate. */
+void r600_cf_vtx_tc(struct r600_vertex_element *ve, u32 *bytecode, unsigned count)
+{
+	struct r600_pipe_state *state = &ve->rstate;
+	/* the first clause holds at most 8 fetch instructions */
+	unsigned first = count > 8 ? 8 : count;
+	u32 *cf = bytecode;
+
+	/* first clause points at address 8 >> 1 */
+	*cf++ = S_SQ_CF_WORD0_ADDR(8 >> 1);
+	*cf++ = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC) |
+		S_SQ_CF_WORD1_BARRIER(1) |
+		S_SQ_CF_WORD1_COUNT(first - 1);
+	if (count > 8) {
+		/* remaining fetches: dword 40 = 8 + 8 fetches * 4 dwords */
+		*cf++ = S_SQ_CF_WORD0_ADDR(40 >> 1);
+		*cf++ = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC) |
+			S_SQ_CF_WORD1_BARRIER(1) |
+			S_SQ_CF_WORD1_COUNT((count - 8) - 1);
+	}
+	*cf++ = S_SQ_CF_WORD0_ADDR(0);
+	*cf++ = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN) |
+		S_SQ_CF_WORD1_BARRIER(1);
+
+	state->id = R600_PIPE_STATE_FETCH_SHADER;
+	state->nregs = 0;
+	r600_pipe_state_add_reg(state, R_0288A4_SQ_PGM_RESOURCES_FS,
+				0x00000000, 0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(state, R_0288DC_SQ_PGM_CF_OFFSET_FS,
+				0x00000000, 0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(state, R_028894_SQ_PGM_START_FS,
+				r600_bo_offset(ve->fetch_shader) >> 8,
+				0xFFFFFFFF, ve->fetch_shader);
+}
+
+/* Translate a gallium vertex format into the DATA_FORMAT, NUM_FORMAT_ALL and
+ * FORMAT_COMP_ALL values of a vertex fetch instruction. For an unsupported
+ * format an error is logged and all three outputs are left at zero. */
+static void r600_vertex_data_type(enum pipe_format pformat, unsigned *format,
+				unsigned *num_format, unsigned *format_comp)
+{
+	const struct util_format_description *desc;
+	unsigned i;
+
+	*format = 0;
+	*num_format = 0;
+	*format_comp = 0;
+
+	desc = util_format_description(pformat);
+	if (desc == NULL || desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
+		goto out_unknown;
+	}
+
+	/* Find the first non-VOID channel; reject formats that have none. */
+	for (i = 0; i < 4 && desc->channel[i].type == UTIL_FORMAT_TYPE_VOID; i++)
+		;
+	if (i == 4) {
+		goto out_unknown;
+	}
+
+	switch (desc->channel[i].type) {
+		/* Half-floats, floats, doubles */
+	case UTIL_FORMAT_TYPE_FLOAT:
+		switch (desc->channel[i].size) {
+		case 16:
+			switch (desc->nr_channels) {
+			case 1:
+				*format = FMT_16_FLOAT;
+				break;
+			case 2:
+				*format = FMT_16_16_FLOAT;
+				break;
+			case 3:
+				*format = FMT_16_16_16_FLOAT;
+				break;
+			case 4:
+				*format = FMT_16_16_16_16_FLOAT;
+				break;
+			}
+			break;
+		case 32:
+			switch (desc->nr_channels) {
+			case 1:
+				*format = FMT_32_FLOAT;
+				break;
+			case 2:
+				*format = FMT_32_32_FLOAT;
+				break;
+			case 3:
+				*format = FMT_32_32_32_FLOAT;
+				break;
+			case 4:
+				*format = FMT_32_32_32_32_FLOAT;
+				break;
+			}
+			break;
+		default:
+			goto out_unknown;
+		}
+		break;
+		/* Unsigned ints */
+	case UTIL_FORMAT_TYPE_UNSIGNED:
+		/* Signed ints */
+	case UTIL_FORMAT_TYPE_SIGNED:
+		switch (desc->channel[i].size) {
+		case 8:
+			switch (desc->nr_channels) {
+			case 1:
+				*format = FMT_8;
+				break;
+			case 2:
+				*format = FMT_8_8;
+				break;
+			case 3:
+				/* FMT_8_8_8 fails the piglit draw-vertices test,
+				 * so fall through and use FMT_8_8_8_8 instead */
+			case 4:
+				*format = FMT_8_8_8_8;
+				break;
+			}
+			break;
+		case 16:
+			switch (desc->nr_channels) {
+			case 1:
+				*format = FMT_16;
+				break;
+			case 2:
+				*format = FMT_16_16;
+				break;
+			case 3:
+				/* FMT_16_16_16 fails the piglit draw-vertices test,
+				 * so fall through and use FMT_16_16_16_16 instead */
+			case 4:
+				*format = FMT_16_16_16_16;
+				break;
+			}
+			break;
+		case 32:
+			switch (desc->nr_channels) {
+			case 1:
+				*format = FMT_32;
+				break;
+			case 2:
+				*format = FMT_32_32;
+				break;
+			case 3:
+				*format = FMT_32_32_32;
+				break;
+			case 4:
+				*format = FMT_32_32_32_32;
+				break;
+			}
+			break;
+		default:
+			goto out_unknown;
+		}
+		break;
+	default:
+		goto out_unknown;
+	}
+
+	if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
+		*format_comp = 1;
+	}
+	/* normalized channels use num_format 0, all others 2 */
+	*num_format = desc->channel[i].normalized ? 0 : 2;
+	return;
+out_unknown:
+	R600_ERR("unsupported vertex format %s\n", util_format_name(pformat));
+}
+
+/* Dump ndw dwords of raw bytecode to stderr, preceded by a one-character
+ * chip tag: '7' for chiprev 1, 'E' for chiprev 2, '6' for anything else. */
+void r600_bc(unsigned ndw, unsigned chiprev, u32 *bytecode)
+{
+	unsigned i;
+	char chip;
+
+	switch (chiprev) {
+	case 1:
+		chip = '7';
+		break;
+	case 2:
+		chip = 'E';
+		break;
+	default:
+		chip = '6';
+		break;
+	}
+	fprintf(stderr, "bytecode %u dw -----------------------\n", ndw);
+	fprintf(stderr, "    %c\n", chip);
+	for (i = 0; i < ndw; i++)
+		fprintf(stderr, "0x%08X\n", bytecode[i]);
+	fprintf(stderr, "--------------------------------------\n");
+}
+
+/* Allocate ve->fetch_shader and fill it with one vertex fetch instruction per
+ * element of ve. Returns 0 on success, -ENOMEM or -EINVAL on failure. */
+int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, struct r600_vertex_element *ve)
+{
+	unsigned ndw, i;
+	u32 *bytecode;
+	unsigned fetch_resource_start = 0, format, num_format, format_comp;
+	struct pipe_vertex_element *elements = ve->elements;
+	const struct util_format_description *desc;
+
+	/* 8 dwords reserved for the cf instructions + 4 dwords per input */
+	ndw = 8 + ve->count * 4;
+	ve->fs_size = ndw * 4;
+
+	/* use PIPE_BIND_VERTEX_BUFFER so we use the cache buffer manager */
+	ve->fetch_shader = r600_bo(rctx->radeon, ndw*4, 256, PIPE_BIND_VERTEX_BUFFER, 0);
+	if (ve->fetch_shader == NULL)
+		return -ENOMEM;
+
+	bytecode = r600_bo_map(rctx->radeon, ve->fetch_shader, 0, NULL);
+	if (bytecode == NULL) {
+		r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL);
+		return -ENOMEM;
+	}
+
+	if (rctx->family >= CHIP_CEDAR) {
+		eg_cf_vtx(ve, &bytecode[0], (ndw - 8) / 4);
+	} else {
+		r600_cf_vtx(ve, &bytecode[0], (ndw - 8) / 4);
+		fetch_resource_start = 160;
+	}
+
+	/* Vertex element offsets need special handling: when an offset is bigger
+	 * than what fits in the fetch instruction, the excess is recorded in
+	 * vbuffer_offset[] so the vertex resource offset can be adjusted. To keep
+	 * the code simple one resource is then bound per element (worst case). */
+	for (i = 0; i < ve->count; i++) {
+		ve->vbuffer_offset[i] = C_SQ_VTX_WORD2_OFFSET & elements[i].src_offset;
+		if (ve->vbuffer_offset[i])
+			ve->vbuffer_need_offset = 1;
+	}
+
+	for (i = 0; i < ve->count; i++) {
+		unsigned vbuffer_index = elements[i].vertex_buffer_index;
+		/* validate the format before r600_vertex_data_type() uses it */
+		desc = util_format_description(ve->hw_format[i]);
+		if (desc == NULL) {
+			R600_ERR("unknown format %d\n", ve->hw_format[i]);
+			r600_bo_unmap(rctx->radeon, ve->fetch_shader);
+			r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL);
+			return -EINVAL;
+		}
+		r600_vertex_data_type(ve->hw_format[i], &format, &num_format, &format_comp);
+
+		/* see above for vbuffer_need_offset explanation */
+		if (ve->vbuffer_need_offset) {
+			bytecode[8 + i * 4 + 0] = S_SQ_VTX_WORD0_BUFFER_ID(i + fetch_resource_start);
+		} else {
+			bytecode[8 + i * 4 + 0] = S_SQ_VTX_WORD0_BUFFER_ID(vbuffer_index + fetch_resource_start);
+		}
+		bytecode[8 + i * 4 + 0] |= S_SQ_VTX_WORD0_SRC_GPR(0) |
+					S_SQ_VTX_WORD0_SRC_SEL_X(0) |
+					S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(0x1F);
+		bytecode[8 + i * 4 + 1] = S_SQ_VTX_WORD1_DST_SEL_X(desc->swizzle[0]) |
+					S_SQ_VTX_WORD1_DST_SEL_Y(desc->swizzle[1]) |
+					S_SQ_VTX_WORD1_DST_SEL_Z(desc->swizzle[2]) |
+					S_SQ_VTX_WORD1_DST_SEL_W(desc->swizzle[3]) |
+					S_SQ_VTX_WORD1_USE_CONST_FIELDS(0) |
+					S_SQ_VTX_WORD1_DATA_FORMAT(format) |
+					S_SQ_VTX_WORD1_NUM_FORMAT_ALL(num_format) |
+					S_SQ_VTX_WORD1_FORMAT_COMP_ALL(format_comp) |
+					S_SQ_VTX_WORD1_SRF_MODE_ALL(1) |
+					S_SQ_VTX_WORD1_GPR_DST_GPR(i + 1);
+		bytecode[8 + i * 4 + 2] = S_SQ_VTX_WORD2_OFFSET(elements[i].src_offset) |
+					S_SQ_VTX_WORD2_MEGA_FETCH(1);
+		bytecode[8 + i * 4 + 3] = 0;
+	}
+	r600_bo_unmap(rctx->radeon, ve->fetch_shader);
+	return 0;
+}
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index 1be5e4a..b147f0f 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -28,6 +28,9 @@
 #define NUM_OF_CYCLES 3
 #define NUM_OF_COMPONENTS 4
 
+struct r600_vertex_element;
+struct r600_pipe_context;
+
 struct r600_bc_alu_src {
 	unsigned			sel;
 	unsigned			chan;
@@ -188,6 +191,7 @@ struct r600_bc {
 
 /* eg_asm.c */
 int eg_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf);
+void eg_cf_vtx(struct r600_vertex_element *ve, u32 *bytecode, unsigned count);
 
 /* r600_asm.c */
 int r600_bc_init(struct r600_bc *bc, enum radeon_family family);
@@ -201,6 +205,10 @@ int r600_bc_build(struct r600_bc *bc);
 int r600_bc_add_cfinst(struct r600_bc *bc, int inst);
 int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int type);
 void r600_bc_dump(struct r600_bc *bc);
+void r600_cf_vtx(struct r600_vertex_element *ve, u32 *bytecode, unsigned count);
+void r600_cf_vtx_tc(struct r600_vertex_element *ve, u32 *bytecode, unsigned count);
+
+int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, struct r600_vertex_element *ve);
 
 /* r700_asm.c */
 int r700_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id);
diff --git a/src/gallium/drivers/r600/r600_buffer.c b/src/gallium/drivers/r600/r600_buffer.c
index 51b8aba..03a61a3 100644
--- a/src/gallium/drivers/r600/r600_buffer.c
+++ b/src/gallium/drivers/r600/r600_buffer.c
@@ -267,10 +267,11 @@ int r600_upload_user_buffers(struct r600_pipe_context *rctx)
 	int i, nr;
 
 	nr = rctx->vertex_elements->count;
+	nr = rctx->nvertex_buffer;
 
 	for (i = 0; i < nr; i++) {
-		struct pipe_vertex_buffer *vb =
-			&rctx->vertex_buffer[rctx->vertex_elements->elements[i].vertex_buffer_index];
+//		struct pipe_vertex_buffer *vb = &rctx->vertex_buffer[rctx->vertex_elements->elements[i].vertex_buffer_index];
+		struct pipe_vertex_buffer *vb = &rctx->vertex_buffer[i];
 
 		if (r600_buffer_is_user_buffer(vb->buffer)) {
 			struct pipe_resource *upload_buffer = NULL;
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index fa0b635..ea57fba 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -90,6 +90,8 @@ static void r600_destroy_context(struct pipe_context *context)
 	u_upload_destroy(rctx->upload_vb);
 	u_upload_destroy(rctx->upload_ib);
 
+	r600_end_vertex_translate(rctx);
+
 	if (rctx->tran.translate_cache)
 		translate_cache_destroy(rctx->tran.translate_cache);
 
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index deec946..ce9f99a 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -54,6 +54,7 @@ enum r600_pipe_state_id {
 	R600_PIPE_STATE_SAMPLER,
 	R600_PIPE_STATE_RESOURCE,
 	R600_PIPE_STATE_POLYGON_OFFSET,
+	R600_PIPE_STATE_FETCH_SHADER,
 	R600_PIPE_NSTATES
 };
 
@@ -87,7 +88,15 @@ struct r600_vertex_element
 	struct pipe_vertex_element	elements[PIPE_MAX_ATTRIBS];
 	enum pipe_format		hw_format[PIPE_MAX_ATTRIBS];
 	unsigned			hw_format_size[PIPE_MAX_ATTRIBS];
-	boolean incompatible_layout;
+	boolean				incompatible_layout;
+	struct r600_bo			*fetch_shader;
+	unsigned			fs_size;
+	struct r600_pipe_state		rstate;
+	/* if the offset is too big for the fetch instruction we need to alter
+	 * the offset of the vertex buffer; record here the offset to add
+	 */
+	unsigned			vbuffer_need_offset;
+	unsigned			vbuffer_offset[PIPE_MAX_ATTRIBS];
 };
 
 struct r600_pipe_shader {
@@ -108,14 +117,14 @@ struct r600_textures_info {
 	unsigned			n_samplers;
 };
 
+/* vertex buffer translation context, used to translate vertex input that
+ * hw doesn't natively support, so far only FLOAT64 is unsupported.
+ */
 struct r600_translate_context {
 	/* Translate cache for incompatible vertex offset/stride/format fallback. */
 	struct translate_cache		*translate_cache;
-
 	/* The vertex buffer slot containing the translated buffer. */
 	unsigned			vb_slot;
-	/* Saved and new vertex element state. */
-	void				*saved_velems;
 	void				*new_velems;
 };
 
@@ -142,6 +151,7 @@ struct r600_pipe_context {
 	struct pipe_stencil_ref		stencil_ref;
 	struct pipe_viewport_state	viewport;
 	struct pipe_clip_state		clip;
+	unsigned			nvs_resource;
 	struct r600_pipe_state		*vs_resource;
 	struct r600_pipe_state		*ps_resource;
 	struct r600_pipe_state		config;
@@ -182,6 +192,7 @@ void evergreen_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader
 void evergreen_pipe_shader_vs(struct pipe_context *ctx, struct r600_pipe_shader *shader);
 void *evergreen_create_db_flush_dsa(struct r600_pipe_context *rctx);
 void evergreen_polygon_offset_update(struct r600_pipe_context *rctx);
+void evergreen_vertex_buffer_update(struct r600_pipe_context *rctx);
 
 /* r600_blit.c */
 void r600_init_blit_functions(struct r600_pipe_context *rctx);
@@ -220,6 +231,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info);
 void r600_init_config(struct r600_pipe_context *rctx);
 void *r600_create_db_flush_dsa(struct r600_pipe_context *rctx);
 void r600_polygon_offset_update(struct r600_pipe_context *rctx);
+void r600_vertex_buffer_update(struct r600_pipe_context *rctx);
 
 /* r600_helper.h */
 int r600_conv_pipe_prim(unsigned pprim, unsigned *prim);
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index f53124d..e40cd1d 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -67,21 +67,23 @@ static void r600_pipe_shader_vs(struct pipe_context *ctx, struct r600_pipe_shade
 			S_028868_STACK_SIZE(rshader->bc.nstack),
 			0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate,
-			R_0288A4_SQ_PGM_RESOURCES_FS,
+			R_0288D0_SQ_PGM_CF_OFFSET_VS,
 			0x00000000, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate,
-			R_0288D0_SQ_PGM_CF_OFFSET_VS,
+			R_028858_SQ_PGM_START_VS,
+			r600_bo_offset(shader->bo) >> 8, 0xFFFFFFFF, shader->bo);
+
+#if 0
+	r600_pipe_state_add_reg(rstate,
+			R_0288A4_SQ_PGM_RESOURCES_FS,
 			0x00000000, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate,
 			R_0288DC_SQ_PGM_CF_OFFSET_FS,
 			0x00000000, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate,
-			R_028858_SQ_PGM_START_VS,
-			r600_bo_offset(shader->bo) >> 8, 0xFFFFFFFF, shader->bo);
-	r600_pipe_state_add_reg(rstate,
 			R_028894_SQ_PGM_START_FS,
 			r600_bo_offset(shader->bo_fetch) >> 8, 0xFFFFFFFF, shader->bo_fetch);
-
+#endif
 	r600_pipe_state_add_reg(rstate,
 				R_03E200_SQ_LOOP_CONST_0 + (32 * 4), 0x01000FFF,
 				0xFFFFFFFF, NULL);
@@ -261,6 +263,7 @@ static int r600_pipe_shader(struct pipe_context *ctx, struct r600_pipe_shader *s
 
 static int r600_shader_update(struct pipe_context *ctx, struct r600_pipe_shader *rshader)
 {
+#if 0
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_shader *shader = &rshader->shader;
 	const struct util_format_description *desc;
@@ -304,6 +307,9 @@ static int r600_shader_update(struct pipe_context *ctx, struct r600_pipe_shader
 		}
 	}
 	return r600_bc_build(&shader->bc_fetch);
+#else
+	return 0;
+#endif
 }
 
 int r600_pipe_shader_update(struct pipe_context *ctx, struct r600_pipe_shader *shader)
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index c592ef2..9b70942 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -94,6 +94,84 @@ void r600_polygon_offset_update(struct r600_pipe_context *rctx)
 	}
 }
 
+/* Rebuild the vertex buffer resources in rctx->vs_resource[] from the bound
+ * vertex elements and vertex buffers, translating or uploading user buffers
+ * first when needed. Does nothing until both are known. */
+void r600_vertex_buffer_update(struct r600_pipe_context *rctx)
+{
+	struct r600_pipe_state *rstate;
+	struct r600_resource *rbuffer;
+	struct pipe_vertex_buffer *vertex_buffer;
+	unsigned i, offset;
+
+	/* we don't update until we know vertex elements */
+	if (rctx->vertex_elements == NULL || !rctx->nvertex_buffer)
+		return;
+
+	/* delete previous translated vertex elements */
+	if (rctx->tran.new_velems) {
+		r600_end_vertex_translate(rctx);
+		/* that unbinds the translated elements if they were current */
+		if (rctx->vertex_elements == NULL)
+			return;
+	}
+
+	if (rctx->vertex_elements->incompatible_layout) {
+		/* translate rebind new vertex elements so
+		 * return once translated
+		 */
+		r600_begin_vertex_translate(rctx);
+		return;
+	}
+
+	if (rctx->any_user_vbs) {
+		r600_upload_user_buffers(rctx);
+		rctx->any_user_vbs = FALSE;
+	}
+
+	if (rctx->vertex_elements->vbuffer_need_offset) {
+		rctx->nvs_resource = rctx->vertex_elements->count;
+	} else {
+		rctx->nvs_resource = rctx->nvertex_buffer;
+	}
+
+	for (i = 0 ; i < rctx->nvs_resource; i++) {
+		rstate = &rctx->vs_resource[i];
+		rstate->id = R600_PIPE_STATE_RESOURCE;
+		rstate->nregs = 0;
+
+		if (rctx->vertex_elements->vbuffer_need_offset) {
+			/* one resource per vertex elements */
+			unsigned vbuffer_index = rctx->vertex_elements->elements[i].vertex_buffer_index;
+			vertex_buffer = &rctx->vertex_buffer[vbuffer_index];
+			offset = rctx->vertex_elements->vbuffer_offset[i];
+		} else {
+			/* bind vertex buffer once */
+			vertex_buffer = &rctx->vertex_buffer[i];
+			offset = 0;
+		}
+		rbuffer = (struct r600_resource*)vertex_buffer->buffer;
+		offset += vertex_buffer->buffer_offset + r600_bo_offset(rbuffer->bo);
+
+		r600_pipe_state_add_reg(rstate, R_038000_RESOURCE0_WORD0,
+					offset, 0xFFFFFFFF, rbuffer->bo);
+		r600_pipe_state_add_reg(rstate, R_038004_RESOURCE0_WORD1,
+					rbuffer->size - offset - 1, 0xFFFFFFFF, NULL);
+		r600_pipe_state_add_reg(rstate, R_038008_RESOURCE0_WORD2,
+					S_038008_STRIDE(vertex_buffer->stride),
+					0xFFFFFFFF, NULL);
+		r600_pipe_state_add_reg(rstate, R_03800C_RESOURCE0_WORD3,
+					0x00000000, 0xFFFFFFFF, NULL);
+		r600_pipe_state_add_reg(rstate, R_038010_RESOURCE0_WORD4,
+					0x00000000, 0xFFFFFFFF, NULL);
+		r600_pipe_state_add_reg(rstate, R_038014_RESOURCE0_WORD5,
+					0x00000000, 0xFFFFFFFF, NULL);
+		r600_pipe_state_add_reg(rstate, R_038018_RESOURCE0_WORD6,
+					0xC0000000, 0xFFFFFFFF, NULL);
+		r600_context_pipe_state_set_fs_resource(&rctx->ctx, rstate, i);
+	}
+}
+
 static void r600_draw_common(struct r600_drawl *draw)
 {
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)draw->ctx;
@@ -132,6 +210,7 @@ static void r600_draw_common(struct r600_drawl *draw)
 	if (r600_pipe_shader_update(&rctx->context, rctx->ps_shader))
 		return;
 
+#if 0
 	for (i = 0 ; i < rctx->vertex_elements->count; i++) {
 		uint32_t word2, format;
 
@@ -159,6 +238,7 @@ static void r600_draw_common(struct r600_drawl *draw)
 		r600_pipe_state_add_reg(rstate, R_038018_RESOURCE0_WORD6, 0xC0000000, 0xFFFFFFFF, NULL);
 		r600_context_pipe_state_set_fs_resource(&rctx->ctx, rstate, i);
 	}
+#endif
 
 	mask = 0;
 	for (int i = 0; i < rctx->framebuffer.nr_cbufs; i++) {
@@ -195,6 +275,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	struct r600_drawl draw;
 	boolean translate = FALSE;
 
+#if 0
 	if (rctx->vertex_elements->incompatible_layout) {
 		r600_begin_vertex_translate(rctx);
 		translate = TRUE;
@@ -204,6 +285,8 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		r600_upload_user_buffers(rctx);
 		rctx->any_user_vbs = FALSE;
 	}
+#endif
+
 	memset(&draw, 0, sizeof(struct r600_drawl));
 	draw.ctx = ctx;
 	draw.mode = info->mode;
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 856f791..8894327 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -121,6 +121,16 @@ void r600_bind_vertex_elements(struct pipe_context *ctx, void *state)
 
 	rctx->vertex_elements = v;
 	if (v) {
+		rctx->states[v->rstate.id] = &v->rstate;
+		r600_context_pipe_state_set(&rctx->ctx, &v->rstate);
+		if (rctx->family >= CHIP_CEDAR) {
+			evergreen_vertex_buffer_update(rctx);
+		} else {
+			r600_vertex_buffer_update(rctx);
+		}
+	}
+
+	if (v) {
 //		rctx->vs_rebuild = TRUE;
 	}
 }
@@ -128,11 +138,16 @@ void r600_bind_vertex_elements(struct pipe_context *ctx, void *state)
 void r600_delete_vertex_element(struct pipe_context *ctx, void *state)
 {
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+	struct r600_vertex_element *v = (struct r600_vertex_element*)state;
 
-	FREE(state);
-
+	/* state may be NULL, e.g. r600_end_vertex_translate() with no translation */
+	if (v && rctx->states[v->rstate.id] == &v->rstate)
+		rctx->states[v->rstate.id] = NULL;
 	if (rctx->vertex_elements == state)
 		rctx->vertex_elements = NULL;
+	if (v)
+		r600_bo_reference(rctx->radeon, &v->fetch_shader, NULL);
+	FREE(state);
 }
 
 
@@ -182,6 +197,11 @@ void r600_set_vertex_buffers(struct pipe_context *ctx, unsigned count,
 	}
 	rctx->nvertex_buffer = count;
 	rctx->vb_max_index = max_index;
+	if (rctx->family >= CHIP_CEDAR) {
+		evergreen_vertex_buffer_update(rctx);
+	} else {
+		r600_vertex_buffer_update(rctx);
+	}
 }
 
 
@@ -192,9 +212,10 @@ void *r600_create_vertex_elements(struct pipe_context *ctx,
 				  unsigned count,
 				  const struct pipe_vertex_element *elements)
 {
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_vertex_element *v = CALLOC_STRUCT(r600_vertex_element);
-	int i;
 	enum pipe_format *format;
+	int i;
 
 	assert(count < 32);
 	if (!v)
@@ -216,12 +237,16 @@ void *r600_create_vertex_elements(struct pipe_context *ctx,
 		}
 		v->incompatible_layout =
 			v->incompatible_layout ||
-			v->elements[i].src_format != v->hw_format[i] ||
-			v->elements[i].src_offset % 4 != 0;
+			v->elements[i].src_format != v->hw_format[i];
 
 		v->hw_format_size[i] = align(util_format_get_blocksize(v->hw_format[i]), 4);
 	}
 
+	if (r600_vertex_elements_build_fetch_shader(rctx, v)) {
+		FREE(v);
+		return NULL;
+	}
+
 	return v;
 }
 
diff --git a/src/gallium/drivers/r600/r600_translate.c b/src/gallium/drivers/r600/r600_translate.c
index 2e082f1..d927f53 100644
--- a/src/gallium/drivers/r600/r600_translate.c
+++ b/src/gallium/drivers/r600/r600_translate.c
@@ -41,6 +41,7 @@ void r600_begin_vertex_translate(struct r600_pipe_context *rctx)
 	struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS] = {0}, *out_transfer;
 	struct pipe_resource *out_buffer;
 	unsigned i, num_verts;
+	struct pipe_vertex_element new_velems[PIPE_MAX_ATTRIBS];
 
 	/* Initialize the translate key, i.e. the recipe how vertices should be
 	 * translated. */
@@ -51,9 +52,7 @@ void r600_begin_vertex_translate(struct r600_pipe_context *rctx)
 		unsigned output_format_size = ve->hw_format_size[i];
 
 		/* Check for support. */
-		if (ve->elements[i].src_format == ve->hw_format[i] &&
-		    (vb->buffer_offset + ve->elements[i].src_offset) % 4 == 0 &&
-		    vb->stride % 4 == 0) {
+		if (ve->elements[i].src_format == ve->hw_format[i]) {
 			continue;
 		}
 
@@ -147,29 +146,22 @@ void r600_begin_vertex_translate(struct r600_pipe_context *rctx)
 	}
 
 	/* Save and replace vertex elements. */
-	{
-		struct pipe_vertex_element new_velems[PIPE_MAX_ATTRIBS];
-
-		rctx->tran.saved_velems = rctx->vertex_elements;
-
-		for (i = 0; i < ve->count; i++) {
-			if (vb_translated[ve->elements[i].vertex_buffer_index]) {
-				te = &key.element[tr_elem_index[i]];
-				new_velems[i].instance_divisor = ve->elements[i].instance_divisor;
-				new_velems[i].src_format = te->output_format;
-				new_velems[i].src_offset = te->output_offset;
-				new_velems[i].vertex_buffer_index = rctx->tran.vb_slot;
-			} else {
-				memcpy(&new_velems[i], &ve->elements[i],
-				       sizeof(struct pipe_vertex_element));
-			}
+	for (i = 0; i < ve->count; i++) {
+		if (vb_translated[ve->elements[i].vertex_buffer_index]) {
+			te = &key.element[tr_elem_index[i]];
+			new_velems[i].instance_divisor = ve->elements[i].instance_divisor;
+			new_velems[i].src_format = te->output_format;
+			new_velems[i].src_offset = te->output_offset;
+			new_velems[i].vertex_buffer_index = rctx->tran.vb_slot;
+		} else {
+			memcpy(&new_velems[i], &ve->elements[i],
+					sizeof(struct pipe_vertex_element));
 		}
-
-		rctx->tran.new_velems =
-			pipe->create_vertex_elements_state(pipe, ve->count, new_velems);
-		pipe->bind_vertex_elements_state(pipe, rctx->tran.new_velems);
 	}
 
+	rctx->tran.new_velems = pipe->create_vertex_elements_state(pipe, ve->count, new_velems);
+	pipe->bind_vertex_elements_state(pipe, rctx->tran.new_velems);
+
 	pipe_resource_reference(&out_buffer, NULL);
 }
 
@@ -178,8 +170,11 @@ void r600_end_vertex_translate(struct r600_pipe_context *rctx)
 	struct pipe_context *pipe = &rctx->context;
 
 	/* Restore vertex elements. */
-	pipe->bind_vertex_elements_state(pipe, rctx->tran.saved_velems);
+	if (rctx->vertex_elements == rctx->tran.new_velems) {
+		pipe->bind_vertex_elements_state(pipe, NULL);
+	}
 	pipe->delete_vertex_elements_state(pipe, rctx->tran.new_velems);
+	rctx->tran.new_velems = NULL;
 
 	/* Delete the now-unused VBO. */
 	pipe_resource_reference(&rctx->vertex_buffer[rctx->tran.vb_slot].buffer,




More information about the mesa-commit mailing list