[Mesa-dev] [PATCH 04/10] radeonsi: use ac_shader_config
Marek Olšák
maraeo at gmail.com
Wed May 8 05:52:12 UTC 2019
On Fri, May 3, 2019 at 7:19 AM Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> ---
> src/amd/common/ac_binary.c | 2 +
> src/gallium/drivers/radeonsi/si_compute.c | 14 +--
> src/gallium/drivers/radeonsi/si_shader.c | 112 +++-------------------
> src/gallium/drivers/radeonsi/si_shader.h | 25 +----
> 4 files changed, 27 insertions(+), 126 deletions(-)
>
> diff --git a/src/amd/common/ac_binary.c b/src/amd/common/ac_binary.c
> index 44251886b5f..d0ca55e0e0d 100644
> --- a/src/amd/common/ac_binary.c
> +++ b/src/amd/common/ac_binary.c
> @@ -218,26 +218,28 @@ void ac_parse_shader_binary_config(const char *data,
> size_t nbytes,
> unsigned value = util_le32_to_cpu(*(uint32_t*)(data + i +
> 4));
> switch (reg) {
> case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
> case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
> case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
> case R_00B848_COMPUTE_PGM_RSRC1:
> case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
> conf->num_sgprs = MAX2(conf->num_sgprs,
> (G_00B028_SGPRS(value) + 1) * 8);
> conf->num_vgprs = MAX2(conf->num_vgprs,
> (G_00B028_VGPRS(value) + 1) * 4);
> conf->float_mode = G_00B028_FLOAT_MODE(value);
> + conf->rsrc1 = value;
> break;
> case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
> conf->lds_size = MAX2(conf->lds_size,
> G_00B02C_EXTRA_LDS_SIZE(value));
> break;
> case R_00B84C_COMPUTE_PGM_RSRC2:
> conf->lds_size = MAX2(conf->lds_size,
> G_00B84C_LDS_SIZE(value));
> + conf->rsrc2 = value;
> break;
> case R_0286CC_SPI_PS_INPUT_ENA:
> conf->spi_ps_input_ena = value;
> break;
> case R_0286D0_SPI_PS_INPUT_ADDR:
> conf->spi_ps_input_addr = value;
> break;
> case R_0286E8_SPI_TMPRING_SIZE:
> case R_00B860_COMPUTE_TMPRING_SIZE:
> /* WAVESIZE is in units of 256 dwords. */
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c
> b/src/gallium/drivers/radeonsi/si_compute.c
> index 541d7e6f118..02d7bac406a 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -59,21 +59,21 @@ static const amd_kernel_code_t
> *si_compute_get_code_object(
> uint64_t symbol_offset)
> {
> if (!program->use_code_object_v2) {
> return NULL;
> }
> return (const amd_kernel_code_t*)
> (program->shader.binary.code + symbol_offset);
> }
>
> static void code_object_to_config(const amd_kernel_code_t *code_object,
> - struct si_shader_config *out_config) {
> + struct ac_shader_config *out_config) {
>
> uint32_t rsrc1 = code_object->compute_pgm_resource_registers;
> uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32;
> out_config->num_sgprs = code_object->wavefront_sgpr_count;
> out_config->num_vgprs = code_object->workitem_vgpr_count;
> out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1);
> out_config->rsrc1 = rsrc1;
> out_config->lds_size = MAX2(out_config->lds_size,
> G_00B84C_LDS_SIZE(rsrc2));
> out_config->rsrc2 = rsrc2;
> out_config->scratch_bytes_per_wave =
> @@ -241,22 +241,22 @@ static void *si_create_compute_state(
> const amd_kernel_code_t *code_object =
> si_compute_get_code_object(program, 0);
> code_object_to_config(code_object,
> &program->shader.config);
> if (program->shader.binary.reloc_count != 0) {
> fprintf(stderr, "Error: %d unsupported
> relocations\n",
>
> program->shader.binary.reloc_count);
> FREE(program);
> return NULL;
> }
> } else {
> -
> si_shader_binary_read_config(&program->shader.binary,
> - &program->shader.config, 0);
> +
> ac_shader_binary_read_config(&program->shader.binary,
> + &program->shader.config, 0, false);
> }
> si_shader_dump(sctx->screen, &program->shader,
> &sctx->debug,
> PIPE_SHADER_COMPUTE, stderr, true);
> if (si_shader_binary_upload(sctx->screen,
> &program->shader) < 0) {
> fprintf(stderr, "LLVM failed to upload shader\n");
> FREE(program);
> return NULL;
> }
> }
>
> @@ -362,21 +362,21 @@ static void si_initialize_compute(struct si_context
> *sctx)
> bc_va >> 8);
> }
> }
>
> sctx->cs_shader_state.emitted_program = NULL;
> sctx->cs_shader_state.initialized = true;
> }
>
> static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
> struct si_shader *shader,
> - struct si_shader_config
> *config)
> + struct ac_shader_config
> *config)
> {
> uint64_t scratch_bo_size, scratch_needed;
> scratch_bo_size = 0;
> scratch_needed = config->scratch_bytes_per_wave *
> sctx->scratch_waves;
> if (sctx->compute_scratch_buffer)
> scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0;
>
> if (scratch_bo_size < scratch_needed) {
> si_resource_reference(&sctx->compute_scratch_buffer, NULL);
>
> @@ -405,38 +405,38 @@ static bool si_setup_compute_scratch_buffer(struct
> si_context *sctx,
> return true;
> }
>
> static bool si_switch_compute_shader(struct si_context *sctx,
> struct si_compute *program,
> struct si_shader *shader,
> const amd_kernel_code_t *code_object,
> unsigned offset)
> {
> struct radeon_cmdbuf *cs = sctx->gfx_cs;
> - struct si_shader_config inline_config = {0};
> - struct si_shader_config *config;
> + struct ac_shader_config inline_config = {0};
> + struct ac_shader_config *config;
> uint64_t shader_va;
>
> if (sctx->cs_shader_state.emitted_program == program &&
> sctx->cs_shader_state.offset == offset)
> return true;
>
> if (program->ir_type != PIPE_SHADER_IR_NATIVE) {
> config = &shader->config;
> } else {
> unsigned lds_blocks;
>
> config = &inline_config;
> if (code_object) {
> code_object_to_config(code_object, config);
> } else {
> - si_shader_binary_read_config(&shader->binary,
> config, offset);
> + ac_shader_binary_read_config(&shader->binary,
> config, offset, false);
> }
>
> lds_blocks = config->lds_size;
> /* XXX: We are over allocating LDS. For SI, the shader
> reports
> * LDS in blocks of 256 bytes, so if there are 4 bytes lds
> * allocated in the shader and 4 bytes allocated by the
> state
> * tracker, then we will set LDS_SIZE to 512 bytes rather
> than 256.
> */
> if (sctx->chip_class <= SI) {
> lds_blocks += align(program->local_size, 256) >> 8;
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
> b/src/gallium/drivers/radeonsi/si_shader.c
> index f6d882cf583..da43447013d 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -4962,104 +4962,20 @@ static void si_llvm_emit_polygon_stipple(struct
> si_shader_context *ctx,
> /* The stipple pattern is 32x32, each row has 32 bits. */
> offset = LLVMBuildMul(builder, address[1],
> LLVMConstInt(ctx->i32, 4, 0), "");
> row = buffer_load_const(ctx, desc, offset);
> row = ac_to_integer(&ctx->ac, row);
> bit = LLVMBuildLShr(builder, row, address[0], "");
> bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
> ac_build_kill_if_false(&ctx->ac, bit);
> }
>
> -void si_shader_binary_read_config(struct ac_shader_binary *binary,
> - struct si_shader_config *conf,
> - unsigned symbol_offset)
> -{
> - unsigned i;
> - const unsigned char *config =
> - ac_shader_binary_config_start(binary, symbol_offset);
> - bool really_needs_scratch = false;
> -
> - /* LLVM adds SGPR spills to the scratch size.
> - * Find out if we really need the scratch buffer.
> - */
> - for (i = 0; i < binary->reloc_count; i++) {
> - const struct ac_shader_reloc *reloc = &binary->relocs[i];
> -
> - if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
> - !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
> - really_needs_scratch = true;
> - break;
> - }
> - }
> -
> - /* XXX: We may be able to emit some of these values directly
> rather than
> - * extracting fields to be emitted later.
> - */
> -
> - for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
> - unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
> - unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i
> + 4));
> - switch (reg) {
> - case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
> - case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
> - case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
> - case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
> - case R_00B848_COMPUTE_PGM_RSRC1:
> - conf->num_sgprs = MAX2(conf->num_sgprs,
> (G_00B028_SGPRS(value) + 1) * 8);
> - conf->num_vgprs = MAX2(conf->num_vgprs,
> (G_00B028_VGPRS(value) + 1) * 4);
> - conf->float_mode = G_00B028_FLOAT_MODE(value);
> - conf->rsrc1 = value;
> - break;
> - case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
> - conf->lds_size = MAX2(conf->lds_size,
> G_00B02C_EXTRA_LDS_SIZE(value));
> - break;
> - case R_00B84C_COMPUTE_PGM_RSRC2:
> - conf->lds_size = MAX2(conf->lds_size,
> G_00B84C_LDS_SIZE(value));
> - conf->rsrc2 = value;
> - break;
> - case R_0286CC_SPI_PS_INPUT_ENA:
> - conf->spi_ps_input_ena = value;
> - break;
> - case R_0286D0_SPI_PS_INPUT_ADDR:
> - conf->spi_ps_input_addr = value;
> - break;
> - case R_0286E8_SPI_TMPRING_SIZE:
> - case R_00B860_COMPUTE_TMPRING_SIZE:
> - /* WAVESIZE is in units of 256 dwords. */
> - if (really_needs_scratch)
> - conf->scratch_bytes_per_wave =
> - G_00B860_WAVESIZE(value) * 256 * 4;
> - break;
> - case 0x4: /* SPILLED_SGPRS */
> - conf->spilled_sgprs = value;
> - break;
> - case 0x8: /* SPILLED_VGPRS */
> - conf->spilled_vgprs = value;
> - break;
> - default:
> - {
> - static bool printed;
> -
> - if (!printed) {
> - fprintf(stderr, "Warning: LLVM
> emitted unknown "
> - "config register: 0x%x\n",
> reg);
> - printed = true;
> - }
> - }
> - break;
> - }
> - }
> -
> - if (!conf->spi_ps_input_addr)
> - conf->spi_ps_input_addr = conf->spi_ps_input_ena;
> -}
> -
> void si_shader_apply_scratch_relocs(struct si_shader *shader,
> uint64_t scratch_va)
> {
> unsigned i;
> uint32_t scratch_rsrc_dword0 = scratch_va;
> uint32_t scratch_rsrc_dword1 =
> S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
>
> /* Enable scratch coalescing. */
> scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
> @@ -5213,21 +5129,21 @@ static void si_shader_dump_disassembly(const
> struct ac_shader_binary *binary,
> fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
> binary->code[i + 3], binary->code[i + 2],
> binary->code[i + 1], binary->code[i]);
> }
> }
> }
>
> static void si_calculate_max_simd_waves(struct si_shader *shader)
> {
> struct si_screen *sscreen = shader->selector->screen;
> - struct si_shader_config *conf = &shader->config;
> + struct ac_shader_config *conf = &shader->config;
> unsigned num_inputs = shader->selector->info.num_inputs;
> unsigned lds_increment = sscreen->info.chip_class >= CIK ? 512 :
> 256;
> unsigned lds_per_wave = 0;
> unsigned max_simd_waves;
>
> max_simd_waves = ac_get_max_simd_waves(sscreen->info.family);
>
> /* Compute LDS usage for PS. */
> switch (shader->selector->type) {
> case PIPE_SHADER_FRAGMENT:
> @@ -5262,46 +5178,46 @@ static void si_calculate_max_simd_waves(struct
> si_shader *shader)
> }
>
> if (conf->num_vgprs)
> max_simd_waves = MIN2(max_simd_waves, 256 /
> conf->num_vgprs);
>
> /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage
> above
> * 16KB makes some SIMDs unoccupied). */
> if (lds_per_wave)
> max_simd_waves = MIN2(max_simd_waves, 16384 /
> lds_per_wave);
>
> - conf->max_simd_waves = max_simd_waves;
> + shader->max_simd_waves = max_simd_waves;
> }
>
> void si_shader_dump_stats_for_shader_db(const struct si_shader *shader,
> struct pipe_debug_callback *debug)
> {
> - const struct si_shader_config *conf = &shader->config;
> + const struct ac_shader_config *conf = &shader->config;
>
> pipe_debug_message(debug, SHADER_INFO,
> "Shader Stats: SGPRS: %d VGPRS: %d Code Size:
> %d "
> "LDS: %d Scratch: %d Max Waves: %d Spilled
> SGPRs: %d "
> "Spilled VGPRs: %d PrivMem VGPRs: %d",
> conf->num_sgprs, conf->num_vgprs,
> si_get_shader_binary_size(shader),
> conf->lds_size, conf->scratch_bytes_per_wave,
> - conf->max_simd_waves, conf->spilled_sgprs,
> - conf->spilled_vgprs, conf->private_mem_vgprs);
> + shader->max_simd_waves, conf->spilled_sgprs,
> + conf->spilled_vgprs, shader->private_mem_vgprs);
> }
>
> static void si_shader_dump_stats(struct si_screen *sscreen,
> const struct si_shader *shader,
> unsigned processor,
> FILE *file,
> bool check_debug_option)
> {
> - const struct si_shader_config *conf = &shader->config;
> + const struct ac_shader_config *conf = &shader->config;
>
> if (!check_debug_option ||
> si_can_dump_shader(sscreen, processor)) {
> if (processor == PIPE_SHADER_FRAGMENT) {
> fprintf(file, "*** SHADER CONFIG ***\n"
> "SPI_PS_INPUT_ADDR = 0x%04x\n"
> "SPI_PS_INPUT_ENA = 0x%04x\n",
> conf->spi_ps_input_addr,
> conf->spi_ps_input_ena);
> }
>
> @@ -5311,24 +5227,24 @@ static void si_shader_dump_stats(struct si_screen
> *sscreen,
> "Spilled SGPRs: %d\n"
> "Spilled VGPRs: %d\n"
> "Private memory VGPRs: %d\n"
> "Code Size: %d bytes\n"
> "LDS: %d blocks\n"
> "Scratch: %d bytes per wave\n"
> "Max Waves: %d\n"
> "********************\n\n\n",
> conf->num_sgprs, conf->num_vgprs,
> conf->spilled_sgprs, conf->spilled_vgprs,
> - conf->private_mem_vgprs,
> + shader->private_mem_vgprs,
> si_get_shader_binary_size(shader),
> conf->lds_size, conf->scratch_bytes_per_wave,
> - conf->max_simd_waves);
> + shader->max_simd_waves);
> }
> }
>
> const char *si_get_shader_name(const struct si_shader *shader, unsigned
> processor)
> {
> switch (processor) {
> case PIPE_SHADER_VERTEX:
> if (shader->key.as_es)
> return "Vertex Shader as ES";
> else if (shader->key.as_ls)
> @@ -5399,21 +5315,21 @@ void si_shader_dump(struct si_screen *sscreen,
> const struct si_shader *shader,
> debug, "epilog", file);
> fprintf(file, "\n");
> }
>
> si_shader_dump_stats(sscreen, shader, processor, file,
> check_debug_option);
> }
>
> static int si_compile_llvm(struct si_screen *sscreen,
> struct ac_shader_binary *binary,
> - struct si_shader_config *conf,
> + struct ac_shader_config *conf,
> struct ac_llvm_compiler *compiler,
> LLVMModuleRef mod,
> struct pipe_debug_callback *debug,
> unsigned processor,
> const char *name,
> bool less_optimized)
> {
> int r = 0;
> unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
>
> @@ -5433,21 +5349,21 @@ static int si_compile_llvm(struct si_screen
> *sscreen,
> LLVMDisposeMessage(ir);
> }
>
> if (!si_replace_shader(count, binary)) {
> r = si_llvm_compile(mod, binary, compiler, debug,
> less_optimized);
> if (r)
> return r;
> }
>
> - si_shader_binary_read_config(binary, conf, 0);
> + ac_shader_binary_read_config(binary, conf, 0, false);
>
> /* Enable 64-bit and 16-bit denormals, because there is no
> performance
> * cost.
> *
> * If denormals are enabled, all floating-point output modifiers
> are
> * ignored.
> *
> * Don't enable denormals for 32-bit floats, because:
> * - Floating-point output modifiers would be ignored by the hw.
> * - Some opcodes don't support denormals, such as v_mad_f32. We
> would
> @@ -6799,21 +6715,21 @@ int si_compile_tgsi_shader(struct si_screen
> *sscreen,
> need_prolog ? 1 : 0, 0);
> }
>
> si_llvm_optimize_module(&ctx);
>
> /* Post-optimization transformations and analysis. */
> si_optimize_vs_outputs(&ctx);
>
> if ((debug && debug->debug_message) ||
> si_can_dump_shader(sscreen, ctx.type)) {
> - ctx.shader->config.private_mem_vgprs =
> + ctx.shader->private_mem_vgprs =
> ac_count_scratch_private_memory(ctx.main_fn);
> }
>
> /* Make sure the input is a pointer and not integer followed by
> inttoptr. */
> assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) ==
> LLVMPointerTypeKind);
>
> /* Compile to bytecode. */
> r = si_compile_llvm(sscreen, &shader->binary, &shader->config,
> compiler,
> ctx.ac.module, debug, ctx.type,
> @@ -7954,23 +7870,23 @@ int si_shader_create(struct si_screen *sscreen,
> struct ac_llvm_compiler *compile
> shader->config.num_sgprs =
> MAX2(shader->config.num_sgprs,
>
> shader->previous_stage->config.num_sgprs);
> shader->config.num_vgprs =
> MAX2(shader->config.num_vgprs,
>
> shader->previous_stage->config.num_vgprs);
> shader->config.spilled_sgprs =
> MAX2(shader->config.spilled_sgprs,
>
> shader->previous_stage->config.spilled_sgprs);
> shader->config.spilled_vgprs =
> MAX2(shader->config.spilled_vgprs,
>
> shader->previous_stage->config.spilled_vgprs);
> - shader->config.private_mem_vgprs =
> - MAX2(shader->config.private_mem_vgprs,
> -
> shader->previous_stage->config.private_mem_vgprs);
> + shader->private_mem_vgprs =
> + MAX2(shader->private_mem_vgprs,
> +
> shader->previous_stage->private_mem_vgprs);
> shader->config.scratch_bytes_per_wave =
> MAX2(shader->config.scratch_bytes_per_wave,
>
> shader->previous_stage->config.scratch_bytes_per_wave);
> shader->info.uses_instanceid |=
>
> shader->previous_stage->info.uses_instanceid;
> }
> if (shader->prolog2) {
> shader->config.num_sgprs =
> MAX2(shader->config.num_sgprs,
>
> shader->prolog2->config.num_sgprs);
> shader->config.num_vgprs =
> MAX2(shader->config.num_vgprs,
> diff --git a/src/gallium/drivers/radeonsi/si_shader.h
> b/src/gallium/drivers/radeonsi/si_shader.h
> index ecf7f8bbd7a..6c8f70dc94b 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.h
> +++ b/src/gallium/drivers/radeonsi/si_shader.h
> @@ -552,36 +552,20 @@ struct si_shader_key {
> * but forces monolithic shaders to be used as soon as
> * possible, because it's in the "opt" group.
> */
> unsigned prefer_mono:1;
> } opt;
> };
>
> /* Restore the pack alignment to default. */
> #pragma pack(pop)
>
> -struct si_shader_config {
> - unsigned num_sgprs;
> - unsigned num_vgprs;
> - unsigned spilled_sgprs;
> - unsigned spilled_vgprs;
> - unsigned private_mem_vgprs;
> - unsigned lds_size;
> - unsigned max_simd_waves;
> - unsigned spi_ps_input_ena;
> - unsigned spi_ps_input_addr;
> - unsigned float_mode;
> - unsigned scratch_bytes_per_wave;
> - unsigned rsrc1;
> - unsigned rsrc2;
> -};
> -
> /* GCN-specific shader info. */
> struct si_shader_info {
> ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS];
> ubyte num_input_sgprs;
> ubyte num_input_vgprs;
> signed char face_vgpr_index;
> signed char ancillary_vgpr_index;
> bool uses_instanceid;
> ubyte nr_pos_exports;
> ubyte nr_param_exports;
> @@ -605,22 +589,24 @@ struct si_shader {
> struct si_shader_key key;
> struct util_queue_fence ready;
> bool compilation_failed;
> bool is_monolithic;
> bool is_optimized;
> bool is_binary_shared;
> bool is_gs_copy_shader;
>
> /* The following data is all that's needed for binary shaders. */
> struct ac_shader_binary binary;
> - struct si_shader_config config;
> + struct ac_shader_config config;
> struct si_shader_info info;
> + unsigned private_mem_vgprs;
> + unsigned max_simd_waves;
>
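The shader cache stores "config" but not these new members (private_mem_vgprs and max_simd_waves), so their values would not be restored for shaders loaded from the cache.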
Marek
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20190508/2ca94e28/attachment-0001.html>
More information about the mesa-dev mailing list