[Mesa-dev] [PATCH 8/8] r600: add support for hw atomic counters. (v3)
Nicolai Hähnle
nhaehnle at gmail.com
Tue Nov 7 17:26:13 UTC 2017
On 07.11.2017 07:31, Dave Airlie wrote:
> From: Dave Airlie <airlied at redhat.com>
>
> This adds support for the evergreen/cayman atomic counters.
>
> These are implemented using GDS append/consume counters. The values
> for each counter are loaded before drawing and saved after each draw
> using special CP packets.
I admit I'm a bit confused by this at the hardware level.
My understanding of GDS is that it's mostly another copy of LDS (but
global), and all GDS instructions are atomic by default. There is extra
append-consume hardware, but its main point is to support use cases
where operations have to be ordered by wave, or where a wave return is
supposed to be blocked (for producer/consumer kernels and ring buffer
management).
So this should really work without the append/consume counters as well,
just with regular GDS memory. Is there a particular reason why you
haven't done that? I suppose it might require more stuff to manage GDS
allocations in the kernel, and if it works with this approach...
Acked-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> v2: move hw atomic assignment into driver.
> v3: fix messing up caps (Gert Wollny), only store ranges in driver,
> drop buffers.
>
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
> src/gallium/drivers/r600/evergreen_state.c | 159 ++++++++++++++++++
> src/gallium/drivers/r600/r600_pipe.c | 15 ++
> src/gallium/drivers/r600/r600_pipe.h | 22 +++
> src/gallium/drivers/r600/r600_shader.c | 239 ++++++++++++++++++++++++---
> src/gallium/drivers/r600/r600_shader.h | 19 +++
> src/gallium/drivers/r600/r600_state_common.c | 46 ++++++
> src/gallium/drivers/r600/r600d_common.h | 2 +
> 7 files changed, 480 insertions(+), 22 deletions(-)
>
> diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
> index 96eb35a..634cd96 100644
> --- a/src/gallium/drivers/r600/evergreen_state.c
> +++ b/src/gallium/drivers/r600/evergreen_state.c
> @@ -3716,6 +3716,38 @@ static void evergreen_set_tess_state(struct pipe_context *ctx,
> rctx->tess_state_dirty = true;
> }
>
> +static void evergreen_set_hw_atomic_buffers(struct pipe_context *ctx,
> + unsigned start_slot,
> + unsigned count,
> + const struct pipe_shader_buffer *buffers)
> +{
> + struct r600_context *rctx = (struct r600_context *)ctx;
> + struct r600_atomic_buffer_state *astate;
> + int i, idx;
> +
> + astate = &rctx->atomic_buffer_state;
> +
> + /* we'd probably like to expand this to 8 later so put the logic in */
> + for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
> + const struct pipe_shader_buffer *buf;
> + struct pipe_shader_buffer *abuf;
> +
> + abuf = &astate->buffer[i];
> +
> + if (!buffers || !buffers[idx].buffer) {
> + pipe_resource_reference(&abuf->buffer, NULL);
> + astate->enabled_mask &= ~(1 << i);
> + continue;
> + }
> + buf = &buffers[idx];
> +
> + pipe_resource_reference(&abuf->buffer, buf->buffer);
> + abuf->buffer_offset = buf->buffer_offset;
> + abuf->buffer_size = buf->buffer_size;
> + astate->enabled_mask |= (1 << i);
> + }
> +}
> +
> void evergreen_init_state_functions(struct r600_context *rctx)
> {
> unsigned id = 1;
> @@ -3801,6 +3833,7 @@ void evergreen_init_state_functions(struct r600_context *rctx)
> rctx->b.b.set_polygon_stipple = evergreen_set_polygon_stipple;
> rctx->b.b.set_min_samples = evergreen_set_min_samples;
> rctx->b.b.set_tess_state = evergreen_set_tess_state;
> + rctx->b.b.set_hw_atomic_buffers = evergreen_set_hw_atomic_buffers;
> if (rctx->b.chip_class == EVERGREEN)
> rctx->b.b.get_sample_position = evergreen_get_sample_position;
> else
> @@ -4107,3 +4140,129 @@ void eg_trace_emit(struct r600_context *rctx)
> radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
> radeon_emit(cs, AC_ENCODE_TRACE_POINT(rctx->trace_id));
> }
> +
> +bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
> + struct r600_shader_atomic *combined_atomics,
> + uint8_t *atomic_used_mask_p)
> +{
> + struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
> + struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state;
> + unsigned pkt_flags = 0;
> + uint8_t atomic_used_mask = 0;
> + int i, j, k;
> +
> + for (i = 0; i < EG_NUM_HW_STAGES; i++) {
> + uint8_t num_atomic_stage;
> + struct r600_pipe_shader *pshader;
> +
> + pshader = rctx->hw_shader_stages[i].shader;
> + if (!pshader)
> + continue;
> +
> + num_atomic_stage = pshader->shader.nhwatomic_ranges;
> + if (!num_atomic_stage)
> + continue;
> +
> + for (j = 0; j < num_atomic_stage; j++) {
> + struct r600_shader_atomic *atomic = &pshader->shader.atomics[j];
> + int natomics = atomic->end - atomic->start + 1;
> +
> + for (k = 0; k < natomics; k++) {
> + /* seen this in a previous stage */
> + if (atomic_used_mask & (1u << (atomic->hw_idx + k)))
> + continue;
> +
> + combined_atomics[atomic->hw_idx + k].hw_idx = atomic->hw_idx + k;
> + combined_atomics[atomic->hw_idx + k].buffer_id = atomic->buffer_id;
> + combined_atomics[atomic->hw_idx + k].start = atomic->start + k;
> + combined_atomics[atomic->hw_idx + k].end = combined_atomics[atomic->hw_idx + k].start + 1;
> + atomic_used_mask |= (1u << (atomic->hw_idx + k));
> + }
> + }
> + }
> +
> + uint32_t mask = atomic_used_mask;
> + while (mask) {
> + unsigned atomic_index = u_bit_scan(&mask);
> + struct r600_shader_atomic *atomic = &combined_atomics[atomic_index];
> + struct r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer);
> + assert(resource);
> + unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
> + resource,
> + RADEON_USAGE_READ,
> + RADEON_PRIO_SHADER_RW_BUFFER);
> + uint64_t dst_offset = resource->gpu_address + (atomic->start * 4);
> + uint32_t base_reg_0 = R_02872C_GDS_APPEND_COUNT_0;
> +
> + uint32_t reg_val = (base_reg_0 + atomic->hw_idx * 4 - EVERGREEN_CONTEXT_REG_OFFSET) >> 2;
> +
> + radeon_emit(cs, PKT3(PKT3_SET_APPEND_CNT, 2, 0) | pkt_flags);
> + radeon_emit(cs, (reg_val << 16) | 0x3);
> + radeon_emit(cs, dst_offset & 0xfffffffc);
> + radeon_emit(cs, (dst_offset >> 32) & 0xff);
> + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
> + radeon_emit(cs, reloc);
> + }
> + *atomic_used_mask_p = atomic_used_mask;
> + return true;
> +}
> +
> +void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
> + struct r600_shader_atomic *combined_atomics,
> + uint8_t *atomic_used_mask_p)
> +{
> + struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
> + struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state;
> + uint32_t pkt_flags = 0;
> + uint32_t event = EVENT_TYPE_PS_DONE;
> + uint32_t mask = astate->enabled_mask;
> + uint64_t dst_offset;
> + unsigned reloc;
> +
> + mask = *atomic_used_mask_p;
> + while (mask) {
> + unsigned atomic_index = u_bit_scan(&mask);
> + struct r600_shader_atomic *atomic = &combined_atomics[atomic_index];
> + struct r600_resource *resource = r600_resource(astate->buffer[atomic->buffer_id].buffer);
> + assert(resource);
> +
> + uint32_t base_reg_0 = R_02872C_GDS_APPEND_COUNT_0;
> + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
> + resource,
> + RADEON_USAGE_WRITE,
> + RADEON_PRIO_SHADER_RW_BUFFER);
> + dst_offset = resource->gpu_address + (atomic->start * 4);
> + uint32_t reg_val = (base_reg_0 + atomic->hw_idx * 4) >> 2;
> +
> + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOS, 3, 0) | pkt_flags);
> + radeon_emit(cs, EVENT_TYPE(event) | EVENT_INDEX(6));
> + radeon_emit(cs, (dst_offset) & 0xffffffff);
> + radeon_emit(cs, (0 << 29) | ((dst_offset >> 32) & 0xff));
> + radeon_emit(cs, reg_val);
> + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
> + radeon_emit(cs, reloc);
> + }
> + ++rctx->append_fence_id;
> + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
> + r600_resource(rctx->append_fence),
> + RADEON_USAGE_READWRITE,
> + RADEON_PRIO_SHADER_RW_BUFFER);
> + dst_offset = r600_resource(rctx->append_fence)->gpu_address;
> + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOS, 3, 0) | pkt_flags);
> + radeon_emit(cs, EVENT_TYPE(event) | EVENT_INDEX(6));
> + radeon_emit(cs, dst_offset & 0xffffffff);
> + radeon_emit(cs, (2 << 29) | ((dst_offset >> 32) & 0xff));
> + radeon_emit(cs, rctx->append_fence_id);
> + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
> + radeon_emit(cs, reloc);
> +
> + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0) | pkt_flags);
> + radeon_emit(cs, WAIT_REG_MEM_GEQUAL | WAIT_REG_MEM_MEMORY | (1 << 8));
> + radeon_emit(cs, dst_offset & 0xffffffff);
> + radeon_emit(cs, ((dst_offset >> 32) & 0xff));
> + radeon_emit(cs, rctx->append_fence_id);
> + radeon_emit(cs, 0xffffffff);
> + radeon_emit(cs, 0xa);
> + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
> + radeon_emit(cs, reloc);
> +}
> diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
> index c6de3ee..6f693d6 100644
> --- a/src/gallium/drivers/r600/r600_pipe.c
> +++ b/src/gallium/drivers/r600/r600_pipe.c
> @@ -74,6 +74,8 @@ static void r600_destroy_context(struct pipe_context *context)
> r600_resource_reference(&rctx->dummy_cmask, NULL);
> r600_resource_reference(&rctx->dummy_fmask, NULL);
>
> + if (rctx->append_fence)
> + pipe_resource_reference((struct pipe_resource**)&rctx->append_fence, NULL);
> for (sh = 0; sh < PIPE_SHADER_TYPES; sh++) {
> rctx->b.b.set_constant_buffer(&rctx->b.b, sh, R600_BUFFER_INFO_CONST_BUFFER, NULL);
> free(rctx->driver_consts[sh].constants);
> @@ -186,6 +188,9 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen,
> rctx->b.family == CHIP_CAICOS ||
> rctx->b.family == CHIP_CAYMAN ||
> rctx->b.family == CHIP_ARUBA);
> +
> + rctx->append_fence = pipe_buffer_create(rctx->b.b.screen, PIPE_BIND_CUSTOM,
> + PIPE_USAGE_DEFAULT, 32);
> break;
> default:
> R600_ERR("Unsupported chip class %d.\n", rctx->b.chip_class);
> @@ -605,8 +610,17 @@ static int r600_get_shader_param(struct pipe_screen* pscreen,
> case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
> case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
> case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
> + return 0;
> case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
> + if (rscreen->b.family >= CHIP_CEDAR && rscreen->has_atomics)
> + return 8;
> + return 0;
> case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
> +		/* having to allocate the atomics out amongst shader stages is messy,
> +		   so give compute 8 buffers and all the others one */
> + if (rscreen->b.family >= CHIP_CEDAR && rscreen->has_atomics) {
> + return EG_MAX_ATOMIC_BUFFERS;
> + }
> return 0;
> case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
> /* due to a bug in the shader compiler, some loops hang
> @@ -741,6 +755,7 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws,
> /* Create the auxiliary context. This must be done last. */
> rscreen->b.aux_context = rscreen->b.b.context_create(&rscreen->b.b, NULL, 0);
>
> + rscreen->has_atomics = rscreen->b.info.drm_minor >= 44;
> #if 0 /* This is for testing whether aux_context and buffer clearing work correctly. */
> struct pipe_resource templ = {};
>
> diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
> index 0d2551a..3dae56e 100644
> --- a/src/gallium/drivers/r600/r600_pipe.h
> +++ b/src/gallium/drivers/r600/r600_pipe.h
> @@ -64,6 +64,8 @@
> #define R600_MAX_DRIVER_CONST_BUFFERS 3
> #define R600_MAX_CONST_BUFFERS (R600_MAX_USER_CONST_BUFFERS + R600_MAX_DRIVER_CONST_BUFFERS)
>
> +#define EG_MAX_ATOMIC_BUFFERS 8
> +
> /* start driver buffers after user buffers */
> #define R600_BUFFER_INFO_CONST_BUFFER (R600_MAX_USER_CONST_BUFFERS)
> #define R600_UCP_SIZE (4*4*8)
> @@ -247,6 +249,7 @@ struct r600_screen {
> struct r600_common_screen b;
> bool has_msaa;
> bool has_compressed_msaa_texturing;
> + bool has_atomics;
>
> /*for compute global memory binding, we allocate stuff here, instead of
> * buffers.
> @@ -416,6 +419,12 @@ struct r600_shader_state {
> struct r600_pipe_shader *shader;
> };
>
> +struct r600_atomic_buffer_state {
> + uint32_t enabled_mask;
> + uint32_t dirty_mask;
> + struct pipe_shader_buffer buffer[EG_MAX_ATOMIC_BUFFERS];
> +};
> +
> struct r600_context {
> struct r600_common_context b;
> struct r600_screen *screen;
> @@ -470,6 +479,7 @@ struct r600_context {
> struct r600_config_state config_state;
> struct r600_stencil_ref_state stencil_ref;
> struct r600_vgt_state vgt_state;
> + struct r600_atomic_buffer_state atomic_buffer_state;
> /* Shaders and shader resources. */
> struct r600_cso_state vertex_fetch_shader;
> struct r600_shader_state hw_shader_stages[EG_NUM_HW_STAGES];
> @@ -531,6 +541,9 @@ struct r600_context {
> struct r600_resource *last_trace_buf;
> struct r600_resource *trace_buf;
> unsigned trace_id;
> +
> + struct pipe_resource *append_fence;
> + uint32_t append_fence_id;
> };
>
> static inline void r600_emit_command_buffer(struct radeon_winsys_cs *cs,
> @@ -959,4 +972,13 @@ unsigned r600_conv_prim_to_gs_out(unsigned mode);
> void eg_trace_emit(struct r600_context *rctx);
> void eg_dump_debug_state(struct pipe_context *ctx, FILE *f,
> unsigned flags);
> +
> +struct r600_shader_atomic;
> +bool evergreen_emit_atomic_buffer_setup(struct r600_context *rctx,
> + struct r600_shader_atomic *combined_atomics,
> + uint8_t *atomic_used_mask_p);
> +void evergreen_emit_atomic_buffer_save(struct r600_context *rctx,
> + struct r600_shader_atomic *combined_atomics,
> + uint8_t *atomic_used_mask_p);
> +
> #endif
> diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
> index 188fbc9..af866c4 100644
> --- a/src/gallium/drivers/r600/r600_shader.c
> +++ b/src/gallium/drivers/r600/r600_shader.c
> @@ -194,6 +194,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
> /* disable SB for shaders using doubles */
> use_sb &= !shader->shader.uses_doubles;
>
> + use_sb &= !shader->shader.uses_atomics;
> +
> /* Check if the bytecode has already been built. */
> if (!shader->shader.bc.bytecode) {
> r = r600_bytecode_build(&shader->shader.bc);
> @@ -407,6 +409,7 @@ static int tgsi_is_supported(struct r600_shader_ctx *ctx)
> if (i->Src[j].Register.Dimension) {
> switch (i->Src[j].Register.File) {
> case TGSI_FILE_CONSTANT:
> + case TGSI_FILE_HW_ATOMIC:
> break;
> case TGSI_FILE_INPUT:
> if (ctx->type == PIPE_SHADER_GEOMETRY ||
> @@ -966,6 +969,17 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
> case TGSI_FILE_ADDRESS:
> break;
>
> + case TGSI_FILE_HW_ATOMIC:
> + i = ctx->shader->nhwatomic_ranges;
> + ctx->shader->atomics[i].start = d->Range.First;
> + ctx->shader->atomics[i].end = d->Range.Last;
> + ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
> + ctx->shader->atomics[i].array_id = d->Array.ArrayID;
> + ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
> + ctx->shader->nhwatomic_ranges++;
> + ctx->shader->nhwatomic += count;
> + break;
> +
> case TGSI_FILE_SYSTEM_VALUE:
> if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
> d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
> @@ -2946,6 +2960,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
> shader->indirect_files = ctx.info.indirect_files;
>
> shader->uses_doubles = ctx.info.uses_doubles;
> + shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
> shader->nsys_inputs = 0;
>
> indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
> @@ -2959,6 +2974,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
> shader->vs_as_gs_a = key.vs.as_gs_a;
> shader->vs_as_es = key.vs.as_es;
> shader->vs_as_ls = key.vs.as_ls;
> + shader->atomic_base = key.vs.first_atomic_counter;
> if (shader->vs_as_es)
> ring_outputs = true;
> if (shader->vs_as_ls)
> @@ -2966,20 +2982,24 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
> break;
> case PIPE_SHADER_GEOMETRY:
> ring_outputs = true;
> + shader->atomic_base = key.gs.first_atomic_counter;
> break;
> case PIPE_SHADER_TESS_CTRL:
> shader->tcs_prim_mode = key.tcs.prim_mode;
> + shader->atomic_base = key.tcs.first_atomic_counter;
> lds_outputs = true;
> lds_inputs = true;
> break;
> case PIPE_SHADER_TESS_EVAL:
> shader->tes_as_es = key.tes.as_es;
> + shader->atomic_base = key.tes.first_atomic_counter;
> lds_inputs = true;
> if (shader->tes_as_es)
> ring_outputs = true;
> break;
> case PIPE_SHADER_FRAGMENT:
> shader->two_side = key.ps.color_two_side;
> + shader->atomic_base = key.ps.first_atomic_counter;
> break;
> default:
> break;
> @@ -7533,6 +7553,181 @@ static int tgsi_tex(struct r600_shader_ctx *ctx)
> return 0;
> }
>
> +static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
> + struct tgsi_full_src_register *src)
> +{
> + int i;
> +
> + if (src->Register.Indirect) {
> + for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
> + if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
> + return ctx->shader->atomics[i].hw_idx;
> + }
> + } else {
> + uint32_t index = src->Register.Index;
> + for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
> + if (ctx->shader->atomics[i].buffer_id != src->Dimension.Index)
> + continue;
> + if (index > ctx->shader->atomics[i].end)
> + continue;
> + if (index < ctx->shader->atomics[i].start)
> + continue;
> + uint32_t offset = (index - ctx->shader->atomics[i].start);
> + return ctx->shader->atomics[i].hw_idx + offset;
> + }
> + }
> + assert(0);
> + return -1;
> +}
> +
> +
> +static int tgsi_load_gds(struct r600_shader_ctx *ctx)
> +{
> + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
> + int r;
> + struct r600_bytecode_gds gds;
> + int uav_id = 0;
> + int uav_index_mode = 0;
> +
> + uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
> +
> + if (inst->Src[0].Register.Indirect)
> + uav_index_mode = 2;
> +
> + memset(&gds, 0, sizeof(struct r600_bytecode_gds));
> + gds.op = FETCH_OP_GDS_READ_RET;
> + gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
> + gds.uav_id = uav_id;
> + gds.uav_index_mode = uav_index_mode;
> + gds.src_gpr = ctx->temp_reg;
> + gds.src_sel_x = 4;
> + gds.src_sel_y = 4;
> + gds.src_sel_z = 4;
> + gds.dst_sel_x = 0;
> + gds.dst_sel_y = 7;
> + gds.dst_sel_z = 7;
> + gds.dst_sel_w = 7;
> + gds.src_gpr2 = ctx->temp_reg;
> + gds.alloc_consume = 1;
> + r = r600_bytecode_add_gds(ctx->bc, &gds);
> + if (r)
> + return r;
> +
> + ctx->bc->cf_last->vpm = 1;
> + return 0;
> +}
> +
> +static int tgsi_load(struct r600_shader_ctx *ctx)
> +{
> + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
> + if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
> + return tgsi_load_gds(ctx);
> + return 0;
> +}
> +
> +static int get_gds_op(int opcode)
> +{
> + switch (opcode) {
> + case TGSI_OPCODE_ATOMUADD:
> + return FETCH_OP_GDS_ADD_RET;
> + case TGSI_OPCODE_ATOMAND:
> + return FETCH_OP_GDS_AND_RET;
> + case TGSI_OPCODE_ATOMOR:
> + return FETCH_OP_GDS_OR_RET;
> + case TGSI_OPCODE_ATOMXOR:
> + return FETCH_OP_GDS_XOR_RET;
> + case TGSI_OPCODE_ATOMUMIN:
> + return FETCH_OP_GDS_MIN_UINT_RET;
> + case TGSI_OPCODE_ATOMUMAX:
> + return FETCH_OP_GDS_MAX_UINT_RET;
> + case TGSI_OPCODE_ATOMXCHG:
> + return FETCH_OP_GDS_XCHG_RET;
> + case TGSI_OPCODE_ATOMCAS:
> + return FETCH_OP_GDS_CMP_XCHG_RET;
> + default:
> + return -1;
> + }
> +}
> +
> +static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
> +{
> + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
> + struct r600_bytecode_gds gds;
> + struct r600_bytecode_alu alu;
> + int gds_op = get_gds_op(inst->Instruction.Opcode);
> + int r;
> + int uav_id = 0;
> + int uav_index_mode = 0;
> +
> + if (gds_op == -1) {
> + fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
> + return -1;
> + }
> +
> + uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
> +
> + if (inst->Src[0].Register.Indirect)
> + uav_index_mode = 2;
> +
> + if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
> + int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
> + int abs_value = abs(value);
> + if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
> + gds_op = FETCH_OP_GDS_SUB_RET;
> + memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> + alu.op = ALU_OP1_MOV;
> + alu.dst.sel = ctx->temp_reg;
> + alu.dst.chan = 0;
> + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
> + alu.src[0].value = abs_value;
> + alu.last = 1;
> + alu.dst.write = 1;
> + r = r600_bytecode_add_alu(ctx->bc, &alu);
> + if (r)
> + return r;
> + } else {
> + memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> + alu.op = ALU_OP1_MOV;
> + alu.dst.sel = ctx->temp_reg;
> + alu.dst.chan = 0;
> + r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
> + alu.last = 1;
> + alu.dst.write = 1;
> + r = r600_bytecode_add_alu(ctx->bc, &alu);
> + if (r)
> + return r;
> + }
> +
> + memset(&gds, 0, sizeof(struct r600_bytecode_gds));
> + gds.op = gds_op;
> + gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
> + gds.uav_id = uav_id;
> + gds.uav_index_mode = uav_index_mode;
> + gds.src_gpr = ctx->temp_reg;
> + gds.src_gpr2 = ctx->temp_reg;
> + gds.src_sel_x = 4;
> + gds.src_sel_y = 0;
> + gds.src_sel_z = 4;
> + gds.dst_sel_x = 0;
> + gds.dst_sel_y = 7;
> + gds.dst_sel_z = 7;
> + gds.dst_sel_w = 7;
> + gds.alloc_consume = 1;
> + r = r600_bytecode_add_gds(ctx->bc, &gds);
> + if (r)
> + return r;
> + ctx->bc->cf_last->vpm = 1;
> + return 0;
> +}
> +
> +static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
> +{
> + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
> + if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
> + return tgsi_atomic_op_gds(ctx);
> + return 0;
> +}
> +
> static int tgsi_lrp(struct r600_shader_ctx *ctx)
> {
> struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
> @@ -9190,22 +9385,22 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] =
> [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
> [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
> [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
> - [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
> + [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
> [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
> [163] = { ALU_OP0_NOP, tgsi_unsupported},
> [164] = { ALU_OP0_NOP, tgsi_unsupported},
> [165] = { ALU_OP0_NOP, tgsi_unsupported},
> [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
> - [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
> + [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
> [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
> [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
> [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
> @@ -9413,22 +9608,22 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] =
> [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
> [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
> [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
> - [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
> + [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
> [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
> [163] = { ALU_OP0_NOP, tgsi_unsupported},
> [164] = { ALU_OP0_NOP, tgsi_unsupported},
> [165] = { ALU_OP0_NOP, tgsi_unsupported},
> [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
> - [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
> - [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
> + [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
> + [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
> [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
> [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
> [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
> diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
> index 9032d50..3fecda4 100644
> --- a/src/gallium/drivers/r600/r600_shader.h
> +++ b/src/gallium/drivers/r600/r600_shader.h
> @@ -56,15 +56,25 @@ struct r600_shader_io {
> int ring_offset;
> };
>
> +struct r600_shader_atomic {
> + unsigned start, end;
> + unsigned buffer_id;
> + unsigned hw_idx;
> + unsigned array_id;
> +};
> +
> struct r600_shader {
> unsigned processor_type;
> struct r600_bytecode bc;
> unsigned ninput;
> unsigned noutput;
> + unsigned nhwatomic;
> unsigned nlds;
> unsigned nsys_inputs;
> struct r600_shader_io input[64];
> struct r600_shader_io output[64];
> + struct r600_shader_atomic atomics[8];
> + unsigned nhwatomic_ranges;
> boolean uses_kill;
> boolean fs_write_all;
> boolean two_side;
> @@ -105,26 +115,35 @@ struct r600_shader {
> struct r600_shader_array * arrays;
>
> boolean uses_doubles;
> + boolean uses_atomics;
> + uint8_t atomic_base;
> };
>
> union r600_shader_key {
> struct {
> unsigned nr_cbufs:4;
> + unsigned first_atomic_counter:4;
> unsigned color_two_side:1;
> unsigned alpha_to_one:1;
> } ps;
> struct {
> unsigned prim_id_out:8;
> + unsigned first_atomic_counter:4;
> unsigned as_es:1; /* export shader */
> unsigned as_ls:1; /* local shader */
> unsigned as_gs_a:1;
> } vs;
> struct {
> + unsigned first_atomic_counter:4;
> unsigned as_es:1;
> } tes;
> struct {
> + unsigned first_atomic_counter:4;
> unsigned prim_mode:3;
> } tcs;
> + struct {
> + unsigned first_atomic_counter:4;
> + } gs;
> };
>
> struct r600_shader_array {
> diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
> index 0e8c5d6..750fd41 100644
> --- a/src/gallium/drivers/r600/r600_state_common.c
> +++ b/src/gallium/drivers/r600/r600_state_common.c
> @@ -698,6 +698,38 @@ static void r600_update_compressed_colortex_mask(struct r600_samplerview_state *
> }
> }
>
> +static int r600_get_hw_atomic_count(const struct pipe_context *ctx,
> + enum pipe_shader_type shader)
> +{
> + const struct r600_context *rctx = (struct r600_context *)ctx;
> + int value = 0;
> + switch (shader) {
> + case PIPE_SHADER_FRAGMENT:
> + case PIPE_SHADER_COMPUTE:
> + default:
> + break;
> + case PIPE_SHADER_VERTEX:
> + value = rctx->ps_shader->info.file_count[TGSI_FILE_HW_ATOMIC];
> + break;
> + case PIPE_SHADER_GEOMETRY:
> + value = rctx->ps_shader->info.file_count[TGSI_FILE_HW_ATOMIC] +
> + rctx->vs_shader->info.file_count[TGSI_FILE_HW_ATOMIC];
> + break;
> + case PIPE_SHADER_TESS_EVAL:
> + value = rctx->ps_shader->info.file_count[TGSI_FILE_HW_ATOMIC] +
> + rctx->vs_shader->info.file_count[TGSI_FILE_HW_ATOMIC] +
> + (rctx->gs_shader ? rctx->gs_shader->info.file_count[TGSI_FILE_HW_ATOMIC] : 0);
> + break;
> + case PIPE_SHADER_TESS_CTRL:
> + value = rctx->ps_shader->info.file_count[TGSI_FILE_HW_ATOMIC] +
> + rctx->vs_shader->info.file_count[TGSI_FILE_HW_ATOMIC] +
> + (rctx->gs_shader ? rctx->gs_shader->info.file_count[TGSI_FILE_HW_ATOMIC] : 0) +
> + rctx->tes_shader->info.file_count[TGSI_FILE_HW_ATOMIC];
> + break;
> + }
> + return value;
> +}
> +
> /* Compute the key for the hw shader variant */
> static inline void r600_shader_selector_key(const struct pipe_context *ctx,
> const struct r600_pipe_shader_selector *sel,
> @@ -716,11 +748,14 @@ static inline void r600_shader_selector_key(const struct pipe_context *ctx,
> key->vs.as_gs_a = true;
> key->vs.prim_id_out = rctx->ps_shader->current->shader.input[rctx->ps_shader->current->shader.ps_prim_id_input].spi_sid;
> }
> + key->vs.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_VERTEX);
> break;
> }
> case PIPE_SHADER_GEOMETRY:
> + key->gs.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_GEOMETRY);
> break;
> case PIPE_SHADER_FRAGMENT: {
> + key->ps.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_FRAGMENT);
> key->ps.color_two_side = rctx->rasterizer && rctx->rasterizer->two_side;
> key->ps.alpha_to_one = rctx->alpha_to_one &&
> rctx->rasterizer && rctx->rasterizer->multisample_enable &&
> @@ -733,9 +768,11 @@ static inline void r600_shader_selector_key(const struct pipe_context *ctx,
> }
> case PIPE_SHADER_TESS_EVAL:
> key->tes.as_es = (rctx->gs_shader != NULL);
> + key->tes.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_TESS_EVAL);
> break;
> case PIPE_SHADER_TESS_CTRL:
> key->tcs.prim_mode = rctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
> + key->tcs.first_atomic_counter = r600_get_hw_atomic_count(ctx, PIPE_SHADER_TESS_CTRL);
> break;
> default:
> assert(0);
> @@ -1700,6 +1737,8 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
> unsigned num_patches, dirty_tex_counter, index_offset = 0;
> unsigned index_size = info->index_size;
> int index_bias;
> + struct r600_shader_atomic combined_atomics[8];
> + uint8_t atomic_used_mask;
>
> if (!info->indirect && !info->count && (index_size || !info->count_from_stream_output)) {
> return;
> @@ -1739,6 +1778,9 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
> : (rctx->tes_shader)? rctx->tes_shader->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]
> : info->mode;
>
> + if (rctx->b.chip_class >= EVERGREEN)
> + evergreen_emit_atomic_buffer_setup(rctx, combined_atomics, &atomic_used_mask);
> +
> if (index_size) {
> index_offset += info->start * index_size;
>
> @@ -2019,6 +2061,10 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
> radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT));
> }
>
> +
> + if (rctx->b.chip_class >= EVERGREEN)
> + evergreen_emit_atomic_buffer_save(rctx, combined_atomics, &atomic_used_mask);
> +
> if (rctx->trace_buf)
> eg_trace_emit(rctx);
>
> diff --git a/src/gallium/drivers/r600/r600d_common.h b/src/gallium/drivers/r600/r600d_common.h
> index ed1d460..b06f90f 100644
> --- a/src/gallium/drivers/r600/r600d_common.h
> +++ b/src/gallium/drivers/r600/r600d_common.h
> @@ -51,6 +51,8 @@
> #define STRMOUT_SELECT_BUFFER(x) (((unsigned)(x) & 0x3) << 8)
> #define PKT3_WAIT_REG_MEM 0x3C
> #define WAIT_REG_MEM_EQUAL 3
> +#define WAIT_REG_MEM_GEQUAL 5
> +#define WAIT_REG_MEM_MEMORY (1 << 4)
> #define WAIT_REG_MEM_MEM_SPACE(x) (((unsigned)(x) & 0x3) << 4)
> #define PKT3_COPY_DATA 0x40
> #define COPY_DATA_SRC_SEL(x) ((x) & 0xf)
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev
mailing list