[Mesa-dev] [PATCH 09/10] radeonsi/compute: Enable PIPE_SHADER_IR_NATIVE for compute shaders

Mon Oct 6 12:44:31 PDT 2014

---
 src/gallium/drivers/radeonsi/si_compute.c |  51 +++++----------
 src/gallium/drivers/radeonsi/si_pipe.c    |   2 +-
 src/gallium/drivers/radeonsi/si_shader.c  | 104 ++++++++++++++++++------------
 src/gallium/drivers/radeonsi/si_shader.h  |   7 ++
 4 files changed, 88 insertions(+), 76 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 490845b..a133380 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -23,14 +23,14 @@
  */
 
 #include "util/u_memory.h"
+#include "radeon/r600_pipe_common.h"
+#include "radeon/radeon_elf_util.h"
 
 #include "radeon/r600_cs.h"
 #include "si_pipe.h"
 #include "si_shader.h"
 #include "sid.h"
 
-#include "radeon/radeon_llvm_util.h"
-
 #define MAX_GLOBAL_BUFFERS 20
 #define NUM_USER_SGPRS 4
 
@@ -40,14 +40,12 @@ struct si_compute {
 	unsigned local_size;
 	unsigned private_size;
 	unsigned input_size;
-	unsigned num_kernels;
-	struct si_shader *kernels;
+	struct radeon_shader_binary binary;
+	struct si_shader program;
 	unsigned num_user_sgprs;
 
 	struct r600_resource *input_buffer;
 	struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
-
-	LLVMContextRef llvm_ctx;
 };
 
 static void *si_create_compute_state(
@@ -57,10 +55,7 @@ static void *si_create_compute_state(
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_compute *program = CALLOC_STRUCT(si_compute);
 	const struct pipe_llvm_program_header *header;
-	const unsigned char *code;
-	unsigned i;
-
-	program->llvm_ctx = LLVMContextCreate();
+	const char *code;
 
 	header = cso->prog;
 	code = cso->prog + sizeof(struct pipe_llvm_program_header);
@@ -70,16 +65,9 @@ static void *si_create_compute_state(
 	program->private_size = cso->req_private_mem;
 	program->input_size = cso->req_input_mem;
 
-	program->num_kernels = radeon_llvm_get_num_kernels(program->llvm_ctx, code,
-							header->num_bytes);
-	program->kernels = CALLOC(sizeof(struct si_shader),
-							program->num_kernels);
-	for (i = 0; i < program->num_kernels; i++) {
-		LLVMModuleRef mod = radeon_llvm_get_kernel_module(program->llvm_ctx, i,
-							code, header->num_bytes);
-		si_compile_llvm(sctx->screen, &program->kernels[i], mod);
-		LLVMDisposeModule(mod);
-	}
+	memset(&program->binary, 0, sizeof(program->binary));
+	radeon_elf_read(code, header->num_bytes, &program->binary, true);
+	si_create_shader(sctx->screen, &program->program, &program->binary);
 
 	program->input_buffer =	si_resource_create_custom(sctx->b.b.screen,
 		PIPE_USAGE_IMMUTABLE, program->input_size);
@@ -177,7 +165,7 @@ static void si_launch_grid(
 	uint64_t shader_va;
 	unsigned arg_user_sgpr_count = NUM_USER_SGPRS;
 	unsigned i;
-	struct si_shader *shader = &program->kernels[pc];
+	struct si_shader *shader = &program->program;
 	unsigned lds_blocks;
 	unsigned num_waves_for_scratch;
 
@@ -194,6 +182,9 @@ static void si_launch_grid(
 
 	pm4->compute_pkt = true;
 
+	/* Read the config informatio */
+	si_shader_binary_read_config(&program->binary, &program->program, pc);
+
 	/* Upload the kernel arguments */
 
 	/* The extra num_work_size_bytes are for work group / work item size information */
@@ -285,7 +276,7 @@ static void si_launch_grid(
 						0x190 /* Default value */);
 	}
 
-	shader_va = shader->bo->gpu_address;
+	shader_va = shader->bo->gpu_address + pc;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
 	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, (shader_va >> 8) & 0xffffffff);
 	si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);
@@ -384,22 +375,12 @@ static void si_delete_compute_state(struct pipe_context *ctx, void* state){
 		return;
 	}
 
-	if (program->kernels) {
-		for (int i = 0; i < program->num_kernels; i++){
-			if (program->kernels[i].bo){
-				si_shader_destroy(ctx, &program->kernels[i]);
-			}
-		}
-		FREE(program->kernels);
-	}
-
-	if (program->llvm_ctx){
-		LLVMContextDispose(program->llvm_ctx);
-	}
 	pipe_resource_reference(
 		(struct pipe_resource **)&program->input_buffer, NULL);
 
-	//And then free the program itself.
+	FREE(program->binary.code);
+	FREE(program->binary.config);
+	FREE(program->binary.rodata);
 	FREE(program);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 2cce5cc..ad6f518 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -334,7 +334,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 	case PIPE_SHADER_COMPUTE:
 		switch (param) {
 		case PIPE_SHADER_CAP_PREFERRED_IR:
-			return PIPE_SHADER_IR_LLVM;
+			return PIPE_SHADER_IR_NATIVE;
 		case PIPE_SHADER_CAP_DOUBLES:
 			return 0; /* XXX: Enable doubles once the compiler can
 			             handle them. */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 9d2cc80..401da1b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -33,6 +33,7 @@
 #include "gallivm/lp_bld_arit.h"
 #include "gallivm/lp_bld_flow.h"
 #include "radeon/radeon_llvm.h"
+#include "radeon/radeon_elf_util.h"
 #include "radeon/radeon_llvm_emit.h"
 #include "util/u_memory.h"
 #include "tgsi/tgsi_parse.h"
@@ -2625,52 +2626,34 @@ static void preload_streamout_buffers(struct si_shader_context *si_shader_ctx)
 	}
 }
 
-int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
-		    LLVMModuleRef mod)
+void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
+				struct si_shader *shader,
+				unsigned symbol_offset)
 {
-	unsigned r; /* llvm_compile result */
 	unsigned i;
-	unsigned char *ptr;
-	struct radeon_shader_binary binary;
-	bool dump = r600_can_dump_shader(&sscreen->b,
-			shader->selector ? shader->selector->tokens : NULL);
-	const char * gpu_family = r600_get_llvm_processor_name(sscreen->b.family);
-	unsigned code_size;
-
-	/* Use LLVM to compile shader */
-	memset(&binary, 0, sizeof(binary));
-	r = radeon_llvm_compile(mod, &binary, gpu_family, dump);
-
-	/* Output binary dump if rscreen->debug_flags are set */
-	if (dump && ! binary.disassembled) {
-		fprintf(stderr, "SI CODE:\n");
-		for (i = 0; i < binary.code_size; i+=4 ) {
-			fprintf(stderr, "%02x%02x%02x%02x\n", binary.code[i + 3],
-				binary.code[i + 2], binary.code[i + 1],
-				binary.code[i]);
-		}
-	}
+	const unsigned char *config =
+		radeon_shader_binary_config_start(binary, symbol_offset);
 
 	/* XXX: We may be able to emit some of these values directly rather than
 	 * extracting fields to be emitted later.
 	 */
-	/* Parse config data in compiled binary */
-	for (i = 0; i < binary.config_size; i+= 8) {
-		unsigned reg = util_le32_to_cpu(*(uint32_t*)(binary.config + i));
-		unsigned value = util_le32_to_cpu(*(uint32_t*)(binary.config + i + 4));
+
+	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
+		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
+		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
 		switch (reg) {
 		case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
 		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
 		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
 		case R_00B848_COMPUTE_PGM_RSRC1:
-			shader->num_sgprs = (G_00B028_SGPRS(value) + 1) * 8;
-			shader->num_vgprs = (G_00B028_VGPRS(value) + 1) * 4;
+			shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
+			shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
 			break;
 		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
-			shader->lds_size = G_00B02C_EXTRA_LDS_SIZE(value);
+			shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
 			break;
 		case R_00B84C_COMPUTE_PGM_RSRC2:
-			shader->lds_size = G_00B84C_LDS_SIZE(value);
+			shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value));
 			break;
 		case R_0286CC_SPI_PS_INPUT_ENA:
 			shader->spi_ps_input_ena = value;
@@ -2686,9 +2669,32 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 			break;
 		}
 	}
+}
+
+int si_create_shader(struct si_screen *sscreen,
+		struct si_shader *shader,
+		const struct radeon_shader_binary *binary)
+{
+
+	unsigned i;
+	unsigned code_size;
+	unsigned char *ptr;
+	bool dump  = r600_can_dump_shader(&sscreen->b,
+		shader->selector ? shader->selector->tokens : NULL);
+
+	if (dump && !binary->disassembled) {
+		fprintf(stderr, "SI CODE:\n");
+		for (i = 0; i < binary->code_size; i+=4 ) {
+			fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3],
+				binary->code[i + 2], binary->code[i + 1],
+				binary->code[i]);
+		}
+	}
+
+	si_shader_binary_read_config(binary, shader, 0);
 
 	/* copy new shader */
-	code_size = binary.code_size + binary.rodata_size;
+	code_size = binary->code_size + binary->rodata_size;
 	r600_resource_reference(&shader->bo, NULL);
 	shader->bo = si_resource_create_custom(&sscreen->b.b, PIPE_USAGE_IMMUTABLE,
 					       code_size);
@@ -2696,19 +2702,37 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 		return -ENOMEM;
 	}
 
-	ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, PIPE_TRANSFER_WRITE);
-	util_memcpy_cpu_to_le32(ptr, binary.code, binary.code_size);
-	if (binary.rodata_size > 0) {
-		ptr += binary.code_size;
-		util_memcpy_cpu_to_le32(ptr, binary.rodata, binary.rodata_size);
+
+	ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, PIPE_TRANSFER_READ_WRITE);
+	util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size);
+	if (binary->rodata_size > 0) {
+		ptr += binary->code_size;
+		util_memcpy_cpu_to_le32(ptr, binary->rodata, binary->rodata_size);
 	}
 
 	sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
 
-	free(binary.code);
-	free(binary.config);
-	free(binary.rodata);
+	return 0;
+}
+
+int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
+							LLVMModuleRef mod)
+{
+	int r = 0;
+	struct radeon_shader_binary binary;
+	bool dump = r600_can_dump_shader(&sscreen->b,
+			shader->selector ? shader->selector->tokens : NULL);
+	memset(&binary, 0, sizeof(binary));
+	r = radeon_llvm_compile(mod, &binary,
+		r600_get_llvm_processor_name(sscreen->b.family), dump);
 
+	if (r) {
+		return r;
+	}
+	r = si_create_shader(sscreen, shader, &binary);
+	FREE(binary.code);
+	FREE(binary.config);
+	FREE(binary.rodata);
 	return r;
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index d8a63df..c616bc4 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -31,6 +31,8 @@
 
 #include <llvm-c/Core.h> /* LLVMModuleRef */
 
+struct radeon_shader_binary;
+
 #define SI_SGPR_CONST		0
 #define SI_SGPR_SAMPLER		2
 #define SI_SGPR_RESOURCE	4
@@ -204,5 +206,10 @@ int si_shader_create(struct si_screen *sscreen, struct si_shader *shader);
 int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 		    LLVMModuleRef mod);
 void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader);
+int si_create_shader(struct si_screen *sscreen, struct si_shader *shader,
+		const struct radeon_shader_binary *binary);
+void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
+				struct si_shader *shader,
+				unsigned symbol_offset);
 
 #endif
-- 
1.8.5.5