[Mesa-dev] [PATCH] radeonsi: support ARB_compute_variable_group_size

Edward O'Callaghan funfunctor at folklore1984.net
Sat Oct 8 07:03:25 UTC 2016


Reviewed-by: Edward O'Callaghan <funfunctor at folklore1984.net>

On 10/08/2016 04:05 AM, Nicolai Hähnle wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
> 
> Not sure if it's possible to avoid programming the block size twice (once for
> the userdata and once for the dispatch).
> ---
>  docs/features.txt                             |  2 +-
>  docs/relnotes/12.1.0.html                     |  2 +-
>  src/gallium/drivers/radeon/r600_pipe_common.c | 10 +++++-
>  src/gallium/drivers/radeon/r600_pipe_common.h |  2 ++
>  src/gallium/drivers/radeonsi/si_compute.c     | 10 +++++-
>  src/gallium/drivers/radeonsi/si_shader.c      | 44 ++++++++++++++++++---------
>  src/gallium/drivers/radeonsi/si_shader.h      |  4 ++-
>  7 files changed, 55 insertions(+), 19 deletions(-)
> 
> diff --git a/docs/features.txt b/docs/features.txt
> index 08b5892..8917a2e 100644
> --- a/docs/features.txt
> +++ b/docs/features.txt
> @@ -272,21 +272,21 @@ GLES3.2, GLSL ES 3.2 -- all DONE: i965/gen9+
>    GL_OES_texture_border_clamp                           DONE (all drivers)
>    GL_OES_texture_buffer                                 DONE (i965, nvc0, radeonsi)
>    GL_OES_texture_cube_map_array                         DONE (i965/gen8+, nvc0, radeonsi)
>    GL_OES_texture_stencil8                               DONE (all drivers that support GL_ARB_texture_stencil8)
>    GL_OES_texture_storage_multisample_2d_array           DONE (all drivers that support GL_ARB_texture_multisample)
>  
>  Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES version:
>  
>    GL_ARB_bindless_texture                               started (airlied)
>    GL_ARB_cl_event                                       not started
> -  GL_ARB_compute_variable_group_size                    DONE (nvc0)
> +  GL_ARB_compute_variable_group_size                    DONE (nvc0, radeonsi)
>    GL_ARB_ES3_2_compatibility                            DONE (i965/gen8+)
>    GL_ARB_fragment_shader_interlock                      not started
>    GL_ARB_gl_spirv                                       not started
>    GL_ARB_gpu_shader_int64                               started (airlied for core and Gallium, idr for i965)
>    GL_ARB_indirect_parameters                            DONE (nvc0, radeonsi)
>    GL_ARB_parallel_shader_compile                        not started, but Chia-I Wu did some related work in 2014
>    GL_ARB_pipeline_statistics_query                      DONE (i965, nvc0, radeonsi, softpipe, swr)
>    GL_ARB_post_depth_coverage                            not started
>    GL_ARB_robustness_isolation                           not started
>    GL_ARB_sample_locations                               not started
> diff --git a/docs/relnotes/12.1.0.html b/docs/relnotes/12.1.0.html
> index 43af1a5..20fd2cb 100644
> --- a/docs/relnotes/12.1.0.html
> +++ b/docs/relnotes/12.1.0.html
> @@ -42,21 +42,21 @@ TBD.
>  <p>
>  Note: some of the new features are only available with certain drivers.
>  </p>
>  
>  <ul>
>  <li>OpenGL ES 3.1 on i965/hsw</li>
>  <li>OpenGL ES 3.2 on i965/gen9+ (Skylake and later)</li>
>  <li>GL_ARB_ES3_1_compatibility on i965</li>
>  <li>GL_ARB_ES3_2_compatibility on i965/gen8+</li>
>  <li>GL_ARB_clear_texture on r600, radeonsi</li>
> -<li>GL_ARB_compute_variable_group_size on nvc0</li>
> +<li>GL_ARB_compute_variable_group_size on nvc0, radeonsi</li>
>  <li>GL_ARB_cull_distance on radeonsi</li>
>  <li>GL_ARB_enhanced_layouts on i965, radeonsi, llvmpipe, softpipe</li>
>  <li>GL_ARB_indirect_parameters on radeonsi</li>
>  <li>GL_ARB_query_buffer_object on radeonsi</li>
>  <li>GL_ARB_shader_draw_parameters on radeonsi</li>
>  <li>GL_ARB_shader_group_vote on nvc0</li>
>  <li>GL_ARB_shader_viewport_layer_array on i965/gen6+</li>
>  <li>GL_ARB_stencil_texturing on i965/hsw</li>
>  <li>GL_ARB_texture_stencil8 on i965/hsw</li>
>  <li>GL_EXT_window_rectangles on nv50, nvc0</li>
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
> index 44863ee..3dbcbc6 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.c
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.c
> @@ -1030,21 +1030,29 @@ static int r600_get_compute_param(struct pipe_screen *screen,
>  		return sizeof(uint32_t);
>  	case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
>  		break; /* unused */
>  	case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
>  		if (ret) {
>  			uint32_t *subgroup_size = ret;
>  			*subgroup_size = r600_wavefront_size(rscreen->family);
>  		}
>  		return sizeof(uint32_t);
>  	case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
> -		return 0;
> +		if (ret) {
> +			uint64_t *max_variable_threads_per_block = ret;
> +			if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
> +			    ir_type == PIPE_SHADER_IR_TGSI)
> +				*max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
> +			else
> +				*max_variable_threads_per_block = 0;
> +		}
> +		return sizeof(uint64_t);
>  	}
>  
>          fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
>          return 0;
>  }
>  
>  static uint64_t r600_get_timestamp(struct pipe_screen *screen)
>  {
>  	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
>  
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
> index 54991e8..290b228 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.h
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.h
> @@ -99,20 +99,22 @@
>  #define DBG_NO_RB_PLUS		(1llu << 45)
>  #define DBG_SI_SCHED		(1llu << 46)
>  #define DBG_MONOLITHIC_SHADERS	(1llu << 47)
>  #define DBG_NO_CE		(1llu << 48)
>  #define DBG_UNSAFE_MATH		(1llu << 49)
>  #define DBG_NO_DCC_FB		(1llu << 50)
>  
>  #define R600_MAP_BUFFER_ALIGNMENT 64
>  #define R600_MAX_VIEWPORTS        16
>  
> +#define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024
> +
>  enum r600_coherency {
>  	R600_COHERENCY_NONE, /* no cache flushes needed */
>  	R600_COHERENCY_SHADER,
>  	R600_COHERENCY_CB_META,
>  };
>  
>  #ifdef PIPE_ARCH_BIG_ENDIAN
>  #define R600_BIG_ENDIAN 1
>  #else
>  #define R600_BIG_ENDIAN 0
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> index 1d1df2f..e59bafe 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -594,25 +594,33 @@ static void si_setup_tgsi_grid(struct si_context *sctx,
>  		for (i = 0; i < 3; ++i) {
>  			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
>  			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
>  					COPY_DATA_DST_SEL(COPY_DATA_REG));
>  			radeon_emit(cs, (va +  4 * i));
>  			radeon_emit(cs, (va + 4 * i) >> 32);
>  			radeon_emit(cs, (grid_size_reg >> 2) + i);
>  			radeon_emit(cs, 0);
>  		}
>  	} else {
> +		struct si_compute *program = sctx->cs_shader_state.program;
> +		bool variable_group_size =
> +			program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;
>  
> -		radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
> +		radeon_set_sh_reg_seq(cs, grid_size_reg, variable_group_size ? 6 : 3);
>  		radeon_emit(cs, info->grid[0]);
>  		radeon_emit(cs, info->grid[1]);
>  		radeon_emit(cs, info->grid[2]);
> +		if (variable_group_size) {
> +			radeon_emit(cs, info->block[0]);
> +			radeon_emit(cs, info->block[1]);
> +			radeon_emit(cs, info->block[2]);
> +		}
>  	}
>  }
>  
>  static void si_emit_dispatch_packets(struct si_context *sctx,
>                                       const struct pipe_grid_info *info)
>  {
>  	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>  	bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
>  	unsigned waves_per_threadgroup =
>  		DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 64);
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index ff51c8b..49d4121 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -1763,30 +1763,35 @@ static void declare_system_value(
>  
>  	case TGSI_SEMANTIC_GRID_SIZE:
>  		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
>  		break;
>  
>  	case TGSI_SEMANTIC_BLOCK_SIZE:
>  	{
>  		LLVMValueRef values[3];
>  		unsigned i;
>  		unsigned *properties = ctx->shader->selector->info.properties;
> -		unsigned sizes[3] = {
> -			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
> -			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
> -			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
> -		};
>  
> -		for (i = 0; i < 3; ++i)
> -			values[i] = lp_build_const_int32(gallivm, sizes[i]);
> +		if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
> +			unsigned sizes[3] = {
> +				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
> +				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
> +				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
> +			};
> +
> +			for (i = 0; i < 3; ++i)
> +				values[i] = lp_build_const_int32(gallivm, sizes[i]);
>  
> -		value = lp_build_gather_values(gallivm, values, 3);
> +			value = lp_build_gather_values(gallivm, values, 3);
> +		} else {
> +			value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_SIZE);
> +		}
>  		break;
>  	}
>  
>  	case TGSI_SEMANTIC_BLOCK_ID:
>  		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
>  		break;
>  
>  	case TGSI_SEMANTIC_THREAD_ID:
>  		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
>  		break;
> @@ -5673,20 +5678,21 @@ static void create_function(struct si_shader_context *ctx)
>  
>  			for (i = 0; i < num_return_sgprs; i++)
>  				returns[i] = ctx->i32;
>  			for (; i < num_returns; i++)
>  				returns[i] = ctx->f32;
>  		}
>  		break;
>  
>  	case PIPE_SHADER_COMPUTE:
>  		params[SI_PARAM_GRID_SIZE] = v3i32;
> +		params[SI_PARAM_BLOCK_SIZE] = v3i32;
>  		params[SI_PARAM_BLOCK_ID] = v3i32;
>  		last_sgpr = SI_PARAM_BLOCK_ID;
>  
>  		params[SI_PARAM_THREAD_ID] = v3i32;
>  		num_params = SI_PARAM_THREAD_ID + 1;
>  		break;
>  	default:
>  		assert(0 && "unimplemented shader");
>  		return;
>  	}
> @@ -5709,21 +5715,26 @@ static void create_function(struct si_shader_context *ctx)
>  					  S_0286D0_LINEAR_CENTROID_ENA(1) |
>  					  S_0286D0_FRONT_FACE_ENA(1) |
>  					  S_0286D0_POS_FIXED_PT_ENA(1));
>  	} else if (ctx->type == PIPE_SHADER_COMPUTE) {
>  		const unsigned *properties = shader->selector->info.properties;
>  		unsigned max_work_group_size =
>  		               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
>  		               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
>  		               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
>  
> -		assert(max_work_group_size);
> +		if (!max_work_group_size) {
> +			/* This is a variable group size compute shader;
> +			 * compile it for the maximum possible group size.
> +			 */
> +			max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
> +		}
>  
>  		radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
>  		                          "amdgpu-max-work-group-size",
>  		                          max_work_group_size);
>  	}
>  
>  	shader->info.num_input_sgprs = 0;
>  	shader->info.num_input_vgprs = 0;
>  
>  	for (i = 0; i <= last_sgpr; ++i)
> @@ -6646,25 +6657,30 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
>  
>  	/* Validate SGPR and VGPR usage for compute to detect compiler bugs.
>  	 * LLVM 3.9svn has this bug.
>  	 */
>  	if (sel->type == PIPE_SHADER_COMPUTE) {
>  		unsigned *props = sel->info.properties;
>  		unsigned wave_size = 64;
>  		unsigned max_vgprs = 256;
>  		unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
>  		unsigned max_sgprs_per_wave = 128;
> -		unsigned min_waves_per_cu =
> -			DIV_ROUND_UP(props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
> -				     props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
> -				     props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH],
> -				     wave_size);
> +		unsigned max_block_threads;
> +
> +		if (props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH])
> +			max_block_threads = props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
> +					    props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
> +					    props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
> +		else
> +			max_block_threads = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
> +
> +		unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
>  		unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
>  
>  		max_vgprs = max_vgprs / min_waves_per_simd;
>  		max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
>  
>  		if (shader->config.num_sgprs > max_sgprs ||
>  		    shader->config.num_vgprs > max_vgprs) {
>  			fprintf(stderr, "LLVM failed to compile a shader correctly: "
>  				"SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
>  				shader->config.num_sgprs, shader->config.num_vgprs,
> diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
> index 67cb67d2..f2618ac 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.h
> +++ b/src/gallium/drivers/radeonsi/si_shader.h
> @@ -122,21 +122,22 @@ enum {
>  	/* GS limits */
>  	SI_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
>  	SI_GSCOPY_NUM_USER_SGPR = SI_SGPR_RW_BUFFERS_HI + 1,
>  
>  	/* PS only */
>  	SI_SGPR_ALPHA_REF	= SI_NUM_RESOURCE_SGPRS,
>  	SI_PS_NUM_USER_SGPR,
>  
>  	/* CS only */
>  	SI_SGPR_GRID_SIZE = SI_NUM_RESOURCE_SGPRS,
> -	SI_CS_NUM_USER_SGPR = SI_SGPR_GRID_SIZE + 3
> +	SI_SGPR_BLOCK_SIZE = SI_SGPR_GRID_SIZE + 3,
> +	SI_CS_NUM_USER_SGPR = SI_SGPR_BLOCK_SIZE + 3
>  };
>  
>  /* LLVM function parameter indices */
>  enum {
>  	SI_PARAM_RW_BUFFERS,
>  	SI_PARAM_CONST_BUFFERS,
>  	SI_PARAM_SAMPLERS,
>  	SI_PARAM_IMAGES,
>  	SI_PARAM_SHADER_BUFFERS,
>  	SI_NUM_RESOURCE_PARAMS,
> @@ -212,20 +213,21 @@ enum {
>  	SI_PARAM_POS_Y_FLOAT,
>  	SI_PARAM_POS_Z_FLOAT,
>  	SI_PARAM_POS_W_FLOAT,
>  	SI_PARAM_FRONT_FACE,
>  	SI_PARAM_ANCILLARY,
>  	SI_PARAM_SAMPLE_COVERAGE,
>  	SI_PARAM_POS_FIXED_PT,
>  
>  	/* CS only parameters */
>  	SI_PARAM_GRID_SIZE = SI_NUM_RESOURCE_PARAMS,
> +	SI_PARAM_BLOCK_SIZE,
>  	SI_PARAM_BLOCK_ID,
>  	SI_PARAM_THREAD_ID,
>  
>  	SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */
>  };
>  
>  /* SI-specific system values. */
>  enum {
>  	TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI = TGSI_SEMANTIC_COUNT,
>  	TGSI_SEMANTIC_DEFAULT_TESSINNER_SI,
> 

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 819 bytes
Desc: OpenPGP digital signature
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20161008/affc9958/attachment-0001.sig>


More information about the mesa-dev mailing list