[Mesa-dev] [PATCH 8/8] r600: add support for hw atomic counters. (v3)
Nicolai Hähnle
nhaehnle at gmail.com
Tue Nov 7 17:26:13 UTC 2017
On 07.11.2017 07:31, Dave Airlie wrote:
> From: Dave Airlie <airlied at redhat.com>
>
> This adds support for the evergreen/cayman atomic counters.
>
> These are implemented using GDS append/consume counters. The values
> for each counter are loaded before drawing and saved after each draw
> using special CP packets.
I admit I'm a bit confused by this at the hardware level.
My understanding of GDS is that it's mostly another copy of LDS (but
global), and all GDS instructions are atomic by default. There is extra
append-consume hardware, but its main point is to support use cases
where operations have to be ordered by wave, or where a wave return is
supposed to be blocked (for producer/consumer kernels and ring buffer
management).
So this should really work without the append/consume counters as well,
just with regular GDS memory. Is there a particular reason why you
haven't done that? I suppose it might require more stuff to manage GDS
allocations in the kernel, and if it works with this approach...
Acked-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> v2: move hw atomic assignment into driver.
> v3: fix messing up caps (Gert Wollny), only store ranges in driver,
> drop buffers.
>
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
> src/gallium/drivers/r600/evergreen_state.c | 159 ++++++++++++++++++
> src/gallium/drivers/r600/r600_pipe.c | 15 ++
> src/gallium/drivers/r600/r600_pipe.h | 22 +++
> src/gallium/drivers/r600/r600_shader.c | 239 ++++++++++++++++++++++++---
> src/gallium/drivers/r600/r600_shader.h | 19 +++
> src/gallium/drivers/r600/r600_state_common.c | 46 ++++++
> src/gallium/drivers/r600/r600d_common.h | 2 +
> 7 files changed, 480 insertions(+), 22 deletions(-)
>
> diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
> index 96eb35a..634cd96 100644
> --- a/src/gallium/drivers/r600/evergreen_state.c
> +++ b/src/gallium/drivers/r600/evergreen_state.c
> @@ -3716,6 +3716,38 @@ static void evergreen_set_tess_state(struct pipe_context *ctx,
> rctx->tess_state_dirty = true;
> }
>
> +static void evergreen_set_hw_atomic_buffers(struct pipe_context *ctx,
> + unsigned start_slot,
> + unsigned count,
> + const struct pipe_shader_buffer *buffers)
> +{
> + struct r600_context *rctx = (struct r600_context *)ctx;
> + struct r600_atomic_buffer_state *astate;
> + int i, idx;
> +
> + astate = &rctx->atomic_buffer_state;
> +
> + /* we'd probably like to expand this to 8 later so put the logic in */
> + for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
> + const struct pipe_shader_buffer *buf;
> + struct pipe_shader_buffer *abuf;
> +
> + abuf = &astate->buffer[i];
> +
> + if (!buffers || !buffers[idx].buffer) {
> + pipe_resource_reference(&abuf->buffer, NULL);
> + astate->enabled_mask &= ~(1 << i);
> + continue;
> + }
> + buf = &buffers[idx];
> +
> + pipe_resource_reference(&abuf->buffer, buf->buffer);
> + abuf->buffer_offset = buf->buffer_offset;
> + abuf->buffer_size = buf->buffer_size;
> + astate->enabled_mask |= (1 << i);
> + }
> +}
> +
> void evergreen_init_state_functions(struct r600_context *rctx)
> {
> unsigned id = 1;
> @@ -3801,6 +3833,7 @@ void evergreen_init_state_functions(struct r600_context *rctx)
> rctx->b.b.set_polygon_stipple = evergreen_set_polygon_stipple;
> rctx->b.b.set_min_samples = evergreen_set_min_samples;
> rctx->b.b.set_tess_state = evergreen_set_tess_state;
> + rctx->b.b.set_hw_atomic_buffers = evergreen_set_hw_atomic_buffers;
> if (rctx->b.chip_class == EVERGREEN)
> rctx->b.b.get_sample_position = evergreen_get_sample_position;
> else
> @@ -4107,3 +4140,129 @@ void eg_trace_emit(struct r600_context *rctx)
> radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
> radeon_emit(cs, AC_ENCODE_TRACE_POINT(rctx->trace_id));
> }
> +
> +bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
> + struct r600_shader_atomic *combined_atomics,
> + uint8_t *atomic_used_mask_p)
> +{
> + struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
> + struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state;
> + unsigned pkt_flags = 0;
> + uint8_t atomic_used_mask = 0;
> + int i, j, k;
> +
> + for (i = 0; i < EG_NUM_HW_STAGES; i++) {
> + uint8_t num_atomic_stage;
> + struct r600_pipe_shader *pshader;
> +
> + pshader = rctx->hw_shader_stages[i].shader;
> + if (!pshader)
> + continue;
> +
> + num_atomic_stage = pshader->shader.nhwatomic_ranges;
> + if (!num_atomic_stage)
> + continue;
> +
> + for (j = 0; j < num_atomic_stage; j++) {
> + struct r600_shader_atomic *atomic = &pshader->shader.atomics[j];
> + int natomics = atomic->end - atomic->start + 1;
> +
> + for (k = 0; k < natomics; k++) {
> + /* seen this in a previous stage */
> + if (atomic_used_mask & (1u << (atomic->hw_idx + k)))
> + continue;
> +
> + combined_atomics[atomic->hw_idx + k].hw_idx = atomic->hw_idx + k;
> + combined_atomics[atomic->hw_idx + k].buffer_id = atomic->buffer_id;
> + combined_atomics[atomic->hw_idx + k].start = atomic->start + k;
> + combined_atomics[atomic->hw_idx + k].end = combined_atomics[atomic->hw_idx + k].start + 1;
> + atomic_used_mask |= (1u << (atomic->hw_idx + k));
> + }
> + }
> + }
> +
> + uint32_t mask = atomic_used_mask;
> + while (mask) {
> + unsigned atomic_index = u_bit_scan(&mask);
> + struct r600_shader_atomic *atomic = &combined_atomics[atomic_index];
> + struct r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer);
> + assert(resource);
> + unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
> + resource,
> + RADEON_USAGE_READ,
> + RADEON_PRIO_SHADER_RW_BUFFER);
> + uint64_t dst_offset = resource->gpu_address + (atomic->start * 4);
> + uint32_t base_reg_0 = R_02872C_GDS_APPEND_COUNT_0;
> +
> + uint32_t reg_val = (base_reg_0 + atomic->hw_idx * 4 - EVERGREEN_CONTEXT_REG_OFFSET) >> 2;
> +
> + radeon_emit(cs, PKT3(PKT3_SET_APPEND_CNT, 2, 0) | pkt_flags);
> + radeon_emit(cs, (reg_val << 16) | 0x3);
> + radeon_emit(cs, dst_offset & 0xfffffffc);
> + radeon_emit(cs, (dst_offset >> 32) & 0xff);
> + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
> + radeon_emit(cs, reloc);
> + }
> + *atomic_used_mask_p = atomic_used_mask;
> + return true;
> +}
> +
> +void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
> + struct r600_shader_atomic *combined_atomics,
> + uint8_t *atomic_used_mask_p)
> +{
> + struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
> + struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state;
> + uint32_t pkt_flags = 0;
> + uint32_t event = EVENT_TYPE_PS_DONE;
> + uint32_t mask = astate->enabled_mask;
> + uint64_t dst_offset;
> + unsigned reloc;
> +
> + mask = *atomic_used_mask_p;
> + while (mask) {
> + unsigned atomic_index = u_bit_scan(&mask);
> + struct r600_shader_atomic *atomic = &combined_atomics[atomic_index];
> + struct r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer);
> + assert(resource);
> +
> + uint32_t base_reg_0 = R_02872C_GDS_APPEND_COUNT_0;
> + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
> + resource,
> + RADEON_USAGE_WRITE,
> + RADEON_PRIO_SHADER_RW_BUFFER);
> + dst_offset = resource->gpu_address + (atomic->start * 4);
> + uint32_t reg_val = (base_reg_0 + atomic->hw_idx * 4) >> 2;
> +
> + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOS, 3, 0) | pkt_flags);
> + radeon_emit(cs, EVENT_TYPE(event) | EVENT_INDEX(6));
> + radeon_emit(cs, (dst_offset) & 0xffffffff);
> + radeon_emit(cs, (0 << 29) | ((dst_offset >> 32) & 0xff));
> + radeon_emit(cs, reg_val);
> + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
> + radeon_emit(cs, reloc);
> + }
> + ++rctx->append_fence_id;
> + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
> + r600_resource(rctx->append_fence),
> + RADEON_USAGE_READWRITE,
> + RADEON_PRIO_SHADER_RW_BUFFER);
> + dst_offset = r600_resource(rctx->append_fence)->gpu_address;
> + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOS, 3, 0) | pkt_flags);
> + radeon_emit(cs, EVENT_TYPE(event) | EVENT_INDEX(6));
> + radeon_emit(cs, dst_offset & 0xffffffff);
> + radeon_emit(cs, (2 << 29) | ((dst_offset >> 32) & 0xff));
> + radeon_emit(cs, rctx->append_fence_id);
> + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
> + radeon_emit(cs, reloc);
> +
> + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0) | pkt_flags);
> + radeon_emit(cs, WAIT_REG_MEM_GEQUAL | WAIT_REG_MEM_MEMORY | (1 << 8));
> + radeon_emit(cs, dst_offset & 0xffffffff);
> + radeon_emit(cs, ((dst_offset >> 32) & 0xff));
> + radeon_emit(cs, rctx->append_fence_id);
> + radeon_emit(cs, 0xffffffff);
> + radeon_emit(cs, 0xa);
> + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
> + radeon_emit(cs, reloc);
> +}
> diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
> index c6de3ee..6f693d6 100644
> --- a/src/gallium/drivers/r600/r600_pipe.c
> +++ b/src/gallium/drivers/r600/r600_pipe.c
> @@ -74,6 +74,8 @@ static void r600_destroy_context(struct pipe_context *context)
> r600_resource_reference(&rctx->dummy_cmask, NULL);
> r600_resource_reference(&rctx->dummy_fmask, NULL);
>
> + if (rctx->append_fence)
> + pipe_resource_reference((struct pipe_resource**)&rctx->append_fence, NULL);
> for (sh = 0; sh < PIPE_SHADER_TYPES; sh++) {
> rctx->b.b.set_constant_buffer(&rctx->b.b, sh, R600_BUFFER_INFO_CONST_BUFFER, NULL);
> free(rctx->driver_consts[sh].constants);
> @@ -186,6 +188,9 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen,
> rctx->b.family == CHIP_CAICOS ||
> rctx->b.family == CHIP_CAYMAN ||
> rctx->b.family == CHIP_ARUBA);
> +
> + rctx->append_fence = pipe_buffer_create(rctx->b.b.screen, PIPE_BIND_CUSTOM,
> + PIPE_USAGE_DEFAULT, 32);
> break;
> default:
> R600_ERR("Unsupported chip class %d.\n", rctx->b.chip_class);
> @@ -605,8 +610,17 @@ static int r600_get_shader_param(struct pipe_screen* pscreen,
> case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
> case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
> case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
> + return 0;
> case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
> + if (rscreen->b.family >= CHIP_CEDAR && rscreen->has_atomics)
> + return 8;
> + return 0;
> case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
> +		/* having to allocate the atomics out amongst shader stages is messy,
> +		   so give compute 8 buffers and all the others one */
> + if (rscreen->b.family >= CHIP_CEDAR && rscreen->has_atomics) {
> + return EG_MAX_ATOMIC_BUFFERS;
> + }
> return 0;
> case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
> /* due to a bug in the shader compiler, some loops hang
> @@ -741,6 +755,7 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws,
> /* Create the auxiliary context. This must be done last. */
> rscreen->b.aux_context = rscreen->b.b.context_create(&rscreen->b.b, NULL, 0);
>
> + rscreen->has_atomics = rscreen->b.info.drm_minor >= 44;
> #if 0 /* This is for testing whether aux_context and buffer clearing work correctly. */
> struct pipe_resource templ = {};
>
> diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
> index 0d2551a..3dae56e 100644
> --- a/src/gallium/drivers/r600/r600_pipe.h
> +++ b/src/gallium/drivers/r600/r600_pipe.h
> @@ -64,6 +64,8 @@
> #define R600_MAX_DRIVER_CONST_BUFFERS 3
> #define R600_MAX_CONST_BUFFERS (R600_MAX_USER_CONST_BUFFERS + R600_MAX_DRIVER_CONST_BUFFERS)
>
> +#define EG_MAX_ATOMIC_BUFFERS 8
> +
> /* start driver buffers after user buffers */
> #define R600_BUFFER_INFO_CONST_BUFFER (R600_MAX_USER_CONST_BUFFERS)
> #define R600_UCP_SIZE (4*4*8)
> @@ -247,6 +249,7 @@ struct r600_screen {
> struct r600_common_screen b;
> bool has_msaa;
> bool has_compressed_msaa_texturing;
> + bool has_atomics;
>
> /*for compute global memory binding, we allocate stuff here, instead of
> * buffers.
> @@ -416,6 +419,12 @@ struct r600_shader_state {
> struct r600_pipe_shader *shader;
> };
>
> +struct r600_atomic_buffer_state {
> + uint32_t enabled_mask;
> + uint32_t dirty_mask;
> + struct pipe_shader_buffer buffer[EG_MAX_ATOMIC_BUFFERS];
> +};
> +
> struct r600_context {
> struct r600_common_context b;
> struct r600_screen *screen;
> @@ -470,6 +479,7 @@ struct r600_context {
> struct r600_config_state config_state;
> struct r600_stencil_ref_state stencil_ref;
> struct r600_vgt_state vgt_state;
> + struct r600_atomic_buffer_state atomic_buffer_state;
> /* Shaders and shader resources. */
> struct r600_cso_state vertex_fetch_shader;
> struct r600_shader_state hw_shader_stages[EG_NUM_HW_STAGES];
> @@ -531,6 +541,9 @@ struct r600_context {
> struct r600_resource *last_trace_buf;
> struct r600_resource *trace_buf;
> unsigned trace_id;
> +
> + struct pipe_resource *append_fence;
> + uint32_t append_fence_id;
> };
>
> static inline void r600_emit_command_buffer(struct radeon_winsys_cs *cs,
> @@ -959,4 +972,13 @@ unsigned r600_conv_prim_to_gs_out(unsigned mode);
> void eg_trace_emit(struct r600_context *rctx);
> void eg_dump_debug_state(struct pipe_context *ctx, FILE *f,
> unsigned flags);
> +
> +struct r600_shader_atomic;
> +bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
> + struct r600_shader_atomic *combined_atomics,
> + uint8_t *atomic_used_mask_p);
> +void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
> + struct r600_shader_atomic *combined_atomics,
> + uint8_t *atomic_used_mask_p);
> +
> #endif
> diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
> index 188fbc9..af866c4 100644
> --- a/src/gallium/drivers/r600/r600_shader.c
> +++ b/src/gallium/drivers/r600/r600_shader.c
> @@ -194,6 +194,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
> /* disable SB for shaders using doubles */
> use_sb &= !shader->shader.uses_doubles;
>
> + use_sb &= !shader->shader.uses_atomics;
> +
> /* Check if the bytecode has already been built. */
> if (!shader->shader.bc.bytecode) {
> r = r600_bytecode_build(&shader->shader.bc);
> @@ -407,6 +409,7 @@ static int tgsi_is_supported(struct r600_shader_ctx *ctx)
> if (i->Src[j].Register.Dimension) {
> switch (i->Src[j].Register.File) {
> case TGSI_FILE_CONSTANT:
> + case TGSI_FILE_HW_ATOMIC:
> break;
> case TGSI_FILE_INPUT:
> if (ctx->type == PIPE_SHADER_GEOMETRY ||
> @@ -966,6 +969,17 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
> case TGSI_FILE_ADDRESS:
> break;
>
> + case TGSI_FILE_HW_ATOMIC:
> + i = ctx->shader->nhwatomic_ranges;
> + ctx->shader->atomics[i].start = d->Range.First;
> + ctx->shader->atomics[i].end = d->Range.Last;
> + ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
> + ctx->shader->atomics[i].array_id = d->Array.ArrayID;
> + ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
> + ctx->shader->nhwatomic_ranges++;
> + ctx->shader->nhwatomic += count;
> + break;
> +
> case TGSI_FILE_SYSTEM_VALUE:
> if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
> d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
> @@ -2946,6 +2960,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
> shader->indirect_files = ctx.info.indirect_files;
>
> shader->uses_doubles = ctx.info.uses_doubles;
> + shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
> shader->nsys_inputs = 0;
>
> indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
> @@ -2959,6 +2974,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
> shader->vs_as_gs_a = key.vs.as_gs_a;
> shader->vs_as_es = key.vs.as_es;
> shader->vs_as_ls = key.vs.as_ls;
> + shader->atomic_base = key.vs.first_atomic_counter;
> if (shader->vs_as_es)
> ring_outputs = true;
> if (shader->vs_as_ls)
> @@ -2966,20 +2982,24 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
> break;
> case PIPE_SHADER_GEOMETRY:
> ring_outputs = true;
> + shader->atomic_base = key.gs.first_atomic_counter;
> break;
> case PIPE_SHADER_TESS_CTRL:
> shader->tcs_prim_mode = key.tcs.prim_mode;
> + shader->atomic_base = key.tcs.first_atomic_counter;
> lds_outputs = true;
> lds_inputs = true;
> break;
> case PIPE_SHADER_TESS_EVAL:
> shader->tes_as_es = key.tes.as_es;
> + shader->atomic_base = key.tes.first_atomic_counter;
> lds_inputs = true;
> if (shader->tes_as_es)
> ring_outputs = true;
> break;
> case PIPE_SHADER_FRAGMENT:
> shader->two_side = key.ps.color_two_side;
> + shader->atomic_base = key.ps.first_atomic_counter;
> break;
> default:
> break;
> @@ -7533,6 +7553,181 @@ static int tgsi_tex(struct r600_shader_ctx *ctx)
> return 0;
> }
>
> +static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
> + struct tgsi_full_src_register *src)
> +{
> + int i;
> +
> + if (src->Register.Indirect) {
> + for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
> + if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
> + return ctx->shader->atomics[i].hw_idx;
> + }
> + } else {
> + uint32_t index = src->Register.Index;
> + for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
> + if (ctx->shader->atomics[i].buffer_id != src->Dimension.Index)
> + continue;
> + if (index > ctx->shader->atomics[i].end)
> + continue;
> + if (index < ctx->shader->atomics[i].start)
> + continue;
> + uint32_t offset = (index - ctx->shader->atomics[i].start);
> + return ctx->shader->atomics[i].hw_idx + offset;
> + }
> + }
> + assert(0);
> + return -1;
> +}
> +
> +
> +static int tgsi_load_gds(struct r600_shader_ctx *ctx)
> +{
> + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
> + int r;
> + struct r600_bytecode_gds gds;
> + int uav_id = 0;
> + int uav_index_mode = 0;
> +
> + uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
> +
> + if (inst->Src[0].Register.Indirect)
> + uav_index_mode = 2;
> +
> + memset(&gds, 0, sizeof(struct r600_bytecode_gds));
> + gds.op = FETCH_OP_GDS_READ_RET;
> + gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
> + gds.uav_id = uav_id;
> + gds.uav_index_mode = uav_index_mode;
> + gds.src_gpr = ctx->temp_reg;
> + gds.src_sel_x = 4;
> + gds.src_sel_y = 4;
> + gds.src_sel_z = 4;
> + gds.dst_sel_x = 0;
> + gds.dst_sel_y = 7;
> + gds.dst_sel_z = 7;
> + gds.dst_sel_w = 7;
> + gds.src_gpr2 = ctx->temp_reg;
> + gds.alloc_consume = 1;
> + r = r600_bytecode_add_gds(ctx->bc, &gds);
> + if (r)
> + return r;
> +
> + ctx->bc->cf_last->vpm = 1;
> + return 0;
> +}
> +
> +static int tgsi_load(struct r600_shader_ctx *ctx)
> +{
> + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
> + if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
> + return tgsi_load_gds(ctx);
> + return 0;
> +}
> +
> +static int get_gds_op(int opcode)
> +{
> + switch (opcode) {
> + case TGSI_OPCODE_ATOMUADD:
> + return FETCH_OP_GDS_ADD_RET;
> + case TGSI_OPCODE_ATOMAND:
> + return FETCH_OP_GDS_AND_RET;
> + case TGSI_OPCODE_ATOMOR:
> + return FETCH_OP_GDS_OR_RET;
> + case TGSI_OPCODE_ATOMXOR:
> + return FETCH_OP_GDS_XOR_RET;
> + case TGSI_OPCODE_ATOMUMIN:
> + return FETCH_OP_GDS_MIN_UINT_RET;
> + case TGSI_OPCODE_ATOMUMAX:
> + return FETCH_OP_GDS_MAX_UINT_RET;
> + case TGSI_OPCODE_ATOMXCHG:
> + return FETCH_OP_GDS_XCHG_RET;
> + case TGSI_OPCODE_ATOMCAS:
> + return FETCH_OP_GDS_CMP_XCHG_RET;
> + default:
> + return -1;
> + }
> +}
> +
> +static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
> +{
> + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
> + struct r600_bytecode_gds gds;
> + struct r600_bytecode_alu alu;
> + int gds_op = get_gds_op(inst->Instruction.Opcode);
> + int r;
> + int uav_id = 0;
> + int uav_index_mode = 0;
> +
> + if (gds_op == -1) {
> + fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
> + return -1;
> + }
> +
> + uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
> +
> + if (inst->Src[0].Register.Indirect)
> + uav_index_mode = 2;
> +
> + if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
> + int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
> + int abs_value = abs(value);
> + if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
> + gds_op = FETCH_OP_GDS_SUB_RET;
> + memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> + alu.op = ALU_OP1_MOV;
> + alu.dst.sel = ctx->temp_reg;
> + alu.dst.chan = 0;
> + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
> + alu.src[0].value = abs_value;
> + alu.last = 1;
> + alu.dst.write = 1;
> + r = r600_bytecode_add_alu(ctx->bc, &alu);
> + if (r)
> + return r;
> + } else {
> + memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> + alu.op = ALU_OP1_MOV;
> + alu.dst.sel = ctx->temp_reg;
> + alu.dst.chan = 0;
> + r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
> + alu.last = 1;
> + alu.dst.write = 1;
> + r = r600_bytecode_add_alu(ctx->bc, &alu);
> + if (r)
> + return r;
> + }
> +
> + memset(&gds, 0, sizeof(struct r600_bytecode_gds));
> + gds.op = gds_op;
> + gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
> + gds.uav_id = uav_id;
> + gds.uav_index_mode = uav_index_mode;
> + gds.src_gpr = ctx->temp_reg;
> + gds.src_gpr2 = ctx->temp_reg;
> + gds.src_sel_x = 4;
> + gds.src_sel_y = 0;
> + gds.src_sel_z = 4;
> + gds.dst_sel_x = 0;
> + gds.dst_sel_y = 7;
> + gds.dst_sel_z = 7;
> + gds.dst_sel_w = 7;
> + gds.alloc_consume = 1;
> + r = r600_bytecode_add_gds(ctx->bc, &gds);
> + if (r)
> + return r;
> + ctx->bc->cf_last->vpm = 1;
> + return 0;
> +}
> +
> +static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
> +{
> + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
> + if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
> + return tgsi_atomic_op_gds(ctx);
> + return 0;
> +}
> +
> static int tgsi_lrp(struct r600_shader_ctx *ctx)
> {
> struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
> @@ -9190,22 +9385,22 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] =
> [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
> [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
> [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
> - [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
> + [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
> [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
> [163] = { ALU_OP0_NOP, tgsi_unsupported},
> [164] = { ALU_OP0_NOP, tgsi_unsupported},
> [165] = { ALU_OP0_NOP, tgsi_unsupported},
> [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
> - [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
> + [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
> [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
> [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
> [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
> @@ -9413,22 +9608,22 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] =
> [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
> [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
> [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
> - [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
> + [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
> [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
> [163] = { ALU_OP0_NOP, tgsi_unsupported},
> [164] = { ALU_OP0_NOP, tgsi_unsupported},
> [165] = { ALU_OP0_NOP, tgsi_unsupported},
> [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
> - [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
> + [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
> [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
> [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
> [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
> diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
> index 9032d50..3fecda4 100644
> --- a/src/gallium/drivers/r600/r600_shader.h
> +++ b/src/gallium/drivers/r600/r600_shader.h
> @@ -56,15 +56,25 @@ struct r600_shader_io {
> int ring_offset;
> };
>
> +struct r600_shader_atomic {
> + unsigned start, end;
> + unsigned buffer_id;
> + unsigned hw_idx;
> + unsigned array_id;
> +};
> +
> struct r600_shader {
> unsigned processor_type;
> struct r600_bytecode bc;
> unsigned ninput;
> unsigned noutput;
> + unsigned nhwatomic;
> unsigned nlds;
> unsigned nsys_inputs;
> struct r600_shader_io input[64];
> struct r600_shader_io output[64];
> + struct r600_shader_atomic atomics[8];
> + unsigned nhwatomic_ranges;
> boolean uses_kill;
> boolean fs_write_all;
> boolean two_side;
> @@ -105,26 +115,35 @@ struct r600_shader {
> struct r600_shader_array * arrays;
>
> boolean uses_doubles;
> + boolean uses_atomics;
> + uint8_t atomic_base;
> };
>
> union r600_shader_key {
> struct {
> unsigned nr_cbufs:4;
> + unsigned first_atomic_counter:4;
> unsigned color_two_side:1;
> unsigned alpha_to_one:1;
> } ps;
> struct {
> unsigned prim_id_out:8;
> + unsigned first_atomic_counter:4;
> unsigned as_es:1; /* export shader */
> unsigned as_ls:1; /* local shader */
> unsigned as_gs_a:1;
> } vs;
> struct {
> + unsigned first_atomic_counter:4;
> unsigned as_es:1;
> } tes;
> struct {
> + unsigned first_atomic_counter:4;
> unsigned prim_mode:3;
> } tcs;
> + struct {
> + unsigned first_atomic_counter:4;
> + } gs;
> };
>
> struct r600_shader_array {
> diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
> index 0e8c5d6..750fd41 100644
> --- a/src/gallium/drivers/r600/r600_state_common.c
> +++ b/src/gallium/drivers/r600/r600_state_common.c
> @@ -698,6 +698,38 @@ static void r600_update_compressed_colortex_mask(struct r600_samplerview_state *
> }
> }
>
> +static int r600_get_hw_atomic_count(const struct pipe_context *ctx,
> + enum pipe_shader_type shader)
> +{
> + const struct r600_context *rctx = (struct r600_context *)ctx;
> + int value = 0;
> + switch (shader) {
> + case PIPE_SHADER_FRAGMENT:
> + case PIPE_SHADER_COMPUTE:
> + default:
> + break;
> + case PIPE_SHADER_VERTEX:
> + value = rctx->ps_shader->info.file_count[TGSI_FILE_HW_ATOMIC];
> + break;
> + case PIPE_SHADER_GEOMETRY:
> + value = rctx->ps_shader->info.file_count[TGSI_FILE_HW_ATOMIC] +
> + rctx->vs_shader->info.file_count[TGSI_FILE_HW_ATOMIC];
> + break;
> + case PIPE_SHADER_TESS_EVAL:
> + value = rctx->ps_shader->info.file_count[TGSI_FILE_HW_ATOMIC] +
> + rctx->vs_shader->info.file_count[TGSI_FILE_HW_ATOMIC] +
> + (rctx->gs_shader ? rctx->gs_shader->info.file_count[TGSI_FILE_HW_ATOMIC] : 0);
> + break;
> + case PIPE_SHADER_TESS_CTRL:
> + value = rctx->ps_shader->info.file_count[TGSI_FILE_HW_ATOMIC] +
> + rctx->vs_shader->info.file_count[TGSI_FILE_HW_ATOMIC] +
> + (rctx->gs_shader ? rctx->gs_shader->info.file_count[TGSI_FILE_HW_ATOMIC] : 0) +
> + rctx->tes_shader->info.file_count[TGSI_FILE_HW_ATOMIC];
> + break;
> + }
> + return value;
> +}
> +
> /* Compute the key for the hw shader variant */
> static inline void r600_shader_selector_key(const struct pipe_context *ctx,
> const struct r600_pipe_shader_selector *sel,
> @@ -716,11 +748,14 @@ static inline void r600_shader_selector_key(const struct pipe_context *ctx,
> key->vs.as_gs_a = true;
> key->vs.prim_id_out = rctx->ps_shader->current->shader.input[rctx->ps_shader->current->shader.ps_prim_id_input].spi_sid;
> }
> + key->vs.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_VERTEX);
> break;
> }
> case PIPE_SHADER_GEOMETRY:
> + key->gs.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_GEOMETRY);
> break;
> case PIPE_SHADER_FRAGMENT: {
> + key->ps.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_FRAGMENT);
> key->ps.color_two_side = rctx->rasterizer && rctx->rasterizer->two_side;
> key->ps.alpha_to_one = rctx->alpha_to_one &&
> rctx->rasterizer && rctx->rasterizer->multisample_enable &&
> @@ -733,9 +768,11 @@ static inline void r600_shader_selector_key(const struct pipe_context *ctx,
> }
> case PIPE_SHADER_TESS_EVAL:
> key->tes.as_es = (rctx->gs_shader != NULL);
> + key->tes.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_TESS_EVAL);
> break;
> case PIPE_SHADER_TESS_CTRL:
> key->tcs.prim_mode = rctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
> + key->tcs.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_TESS_CTRL);
> break;
> default:
> assert(0);
> @@ -1700,6 +1737,8 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
> unsigned num_patches, dirty_tex_counter, index_offset = 0;
> unsigned index_size = info->index_size;
> int index_bias;
> + struct r600_shader_atomic combined_atomics[8];
> + uint8_t atomic_used_mask;
>
> if (!info->indirect && !info->count && (index_size || !info->count_from_stream_output)) {
> return;
> @@ -1739,6 +1778,9 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
> : (rctx->tes_shader)? rctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]
> : info->mode;
>
> + if (rctx->b.chip_class >= EVERGREEN)
> + evergreen_emit_atomic_buffer_setup(rctx, combined_atomics, &atomic_used_mask);
> +
> if (index_size) {
> index_offset += info->start * index_size;
>
> @@ -2019,6 +2061,10 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
> radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT));
> }
>
> +
> + if (rctx->b.chip_class >= EVERGREEN)
> + evergreen_emit_atomic_buffer_save(rctx, combined_atomics, &atomic_used_mask);
> +
> if (rctx->trace_buf)
> eg_trace_emit(rctx);
>
> diff --git a/src/gallium/drivers/r600/r600d_common.h b/src/gallium/drivers/r600/r600d_common.h
> index ed1d460..b06f90f 100644
> --- a/src/gallium/drivers/r600/r600d_common.h
> +++ b/src/gallium/drivers/r600/r600d_common.h
> @@ -51,6 +51,8 @@
> #define STRMOUT_SELECT_BUFFER(x) (((unsigned)(x) & 0x3) << 8)
> #define PKT3_WAIT_REG_MEM 0x3C
> #define WAIT_REG_MEM_EQUAL 3
> +#define WAIT_REG_MEM_GEQUAL 5
> +#define WAIT_REG_MEM_MEMORY (1 << 4)
> #define WAIT_REG_MEM_MEM_SPACE(x) (((unsigned)(x) & 0x3) << 4)
> #define PKT3_COPY_DATA 0x40
> #define COPY_DATA_SRC_SEL(x) ((x) & 0xf)
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev
mailing list