[Mesa-dev] [PATCH 08/26] winsys/amdgpu: add a parallel compute IB coupled with a gfx IB
Marek Olšák
maraeo at gmail.com
Wed Feb 13 05:16:03 UTC 2019
From: Marek Olšák <marek.olsak at amd.com>
---
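Notes for reviewers (not part of the commit message): with a parallel
compute IB attached, one gfx flush turns into two CS ioctls that share
the same buffer list. A rough sketch of the submit flow implemented
below, assuming DRM 3.28+ for the scheduled-dependency chunk:

    amdgpu_cs_submit_ib()
        build the chunks shared by both submissions (dependencies, ...)
        if the compute IB is non-empty:
            append AMDGPU_CHUNK_ID_DEPENDENCIES for compute-only fences
            append AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES for start fences
            append AMDGPU_CHUNK_ID_IB for the compute IB
            amdgpu_cs_submit_raw2(..., bo_list, ...)    /* no out fence */
            back num_chunks off to drop the compute-only chunks
        append the syncobj signals and AMDGPU_CHUNK_ID_IB for the gfx IB
        amdgpu_cs_submit_raw2(..., bo_list, ..., &seq_no)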
src/amd/common/ac_gpu_info.c | 6 +
src/amd/common/ac_gpu_info.h | 2 +
src/gallium/drivers/r600/r600_pipe_common.c | 4 +-
src/gallium/drivers/radeon/radeon_winsys.h | 36 ++-
src/gallium/drivers/radeonsi/si_fence.c | 4 +-
src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 214 +++++++++++++++++-
src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 13 ++
src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 3 +-
8 files changed, 272 insertions(+), 10 deletions(-)
diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index 6971e4f0a8e..4d9f6afca01 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -398,6 +398,7 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
info->drm_minor >= 13;
info->has_2d_tiling = true;
info->has_read_registers_query = true;
+ info->has_scheduled_fence_dependency = info->drm_minor >= 28;
info->num_render_backends = amdinfo->rb_pipes;
/* The value returned by the kernel driver was wrong. */
@@ -463,6 +464,9 @@ bool ac_query_gpu_info(int fd, amdgpu_device_handle dev,
assert(ib_align);
info->ib_start_alignment = ib_align;
+ info->has_gds_ordered_append = info->chip_class >= CIK &&
+ info->drm_minor >= 29 &&
+ HAVE_LLVM >= 0x0800;
return true;
}
@@ -562,6 +566,8 @@ void ac_print_gpu_info(struct radeon_info *info)
printf(" has_sparse_vm_mappings = %u\n", info->has_sparse_vm_mappings);
printf(" has_2d_tiling = %u\n", info->has_2d_tiling);
printf(" has_read_registers_query = %u\n", info->has_read_registers_query);
+ printf(" has_gds_ordered_append = %u\n", info->has_gds_ordered_append);
+ printf(" has_scheduled_fence_dependency = %u\n", info->has_scheduled_fence_dependency);
printf("Shader core info:\n");
printf(" max_shader_clock = %i\n", info->max_shader_clock);
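For illustration only, a hypothetical driver-side gating sketch built on
these two flags and on the winsys dependency flags added later in this
patch (sscreen and the chosen counter count are assumptions, not part of
this change):

    /* Hypothetical policy: request GDS ordered-append counters only when
     * supported (GFX7+, DRM 3.29+, LLVM 8+, all folded into
     * has_gds_ordered_append), and ask for start fences only when the
     * scheduled-dependency chunk exists; without it the winsys falls
     * back to a regular end-of-job dependency anyway. */
    unsigned num_oa_counters = sscreen->info.has_gds_ordered_append ? 2 : 0;
    unsigned dep_flags = RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY;
    if (sscreen->info.has_scheduled_fence_dependency)
       dep_flags |= RADEON_DEPENDENCY_START_FENCE;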
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index 2c2389eaaa7..bb6984451e7 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -113,6 +113,8 @@ struct radeon_info {
bool has_sparse_vm_mappings;
bool has_2d_tiling;
bool has_read_registers_query;
+ bool has_gds_ordered_append;
+ bool has_scheduled_fence_dependency;
/* Shader cores. */
uint32_t r600_max_quad_pipes; /* wave size / 16 */
diff --git a/src/gallium/drivers/r600/r600_pipe_common.c b/src/gallium/drivers/r600/r600_pipe_common.c
index abfa250435d..3c00ad691ac 100644
--- a/src/gallium/drivers/r600/r600_pipe_common.c
+++ b/src/gallium/drivers/r600/r600_pipe_common.c
@@ -355,8 +355,8 @@ static void r600_add_fence_dependency(struct r600_common_context *rctx,
struct radeon_winsys *ws = rctx->ws;
if (rctx->dma.cs)
- ws->cs_add_fence_dependency(rctx->dma.cs, fence);
- ws->cs_add_fence_dependency(rctx->gfx.cs, fence);
+ ws->cs_add_fence_dependency(rctx->dma.cs, fence, 0);
+ ws->cs_add_fence_dependency(rctx->gfx.cs, fence, 0);
}
static void r600_fence_server_sync(struct pipe_context *ctx,
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index aec91c8d002..c04c014bd2f 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -67,6 +67,16 @@ enum radeon_bo_flag { /* bitfield */
RADEON_FLAG_32BIT = (1 << 6),
};
+enum radeon_dependency_flag {
+ /* Add the dependency to the parallel compute IB only. */
+ RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY = 1 << 0,
+
+ /* Instead of waiting for a job to finish execution, the dependency
+ * is satisfied as soon as the job starts execution.
+ */
+ RADEON_DEPENDENCY_START_FENCE = 1 << 1,
+};
+
enum radeon_bo_usage { /* bitfield */
RADEON_USAGE_READ = 2,
RADEON_USAGE_WRITE = 4,
@@ -486,6 +496,27 @@ struct radeon_winsys {
void *flush_ctx,
bool stop_exec_on_failure);
+ /**
+ * Add a parallel compute IB to a gfx IB. It will share the buffer list
+ * and fence dependencies with the gfx IB. The gfx flush call will submit
+ * both IBs at the same time.
+ *
+ * The compute IB doesn't have an output fence, so the primary IB has
+ * to use a wait packet for synchronization.
+ *
+ * The returned IB is only a stream for writing packets. Calling any
+ * other winsys function with it is not allowed, not even "cs_destroy";
+ * use the gfx IB instead.
+ *
+ * \param cs Gfx IB
+ * \param gds_size Amount of GDS memory that will be available to this IB.
+ * \param num_oa_counters Number of GDS ordered append counters that will
+ * be available for this IB.
+ */
+ struct radeon_cmdbuf *(*cs_add_parallel_compute_ib)(struct radeon_cmdbuf *cs,
+ unsigned gds_size,
+ unsigned num_oa_counters);
+
/**
* Destroy a command stream.
*
@@ -608,9 +639,12 @@ struct radeon_winsys {
/**
* Add a fence dependency to the CS, so that the CS will wait for
* the fence before execution.
+ *
+ * \param dependency_flags Bitmask of RADEON_DEPENDENCY_*
*/
void (*cs_add_fence_dependency)(struct radeon_cmdbuf *cs,
- struct pipe_fence_handle *fence);
+ struct pipe_fence_handle *fence,
+ unsigned dependency_flags);
/**
* Signal a syncobj when the CS finishes execution.
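For illustration, a rough driver-side sketch of the intended call
sequence; the GDS sizes, the NOP packet, and the fence variable are
placeholders for the example, not part of this patch:

    /* Attach a parallel compute IB with 256 bytes of GDS and 4 ordered-
     * append counters (hypothetical values) to an existing gfx CS. */
    struct radeon_cmdbuf *compute_cs =
       ws->cs_add_parallel_compute_ib(gfx_cs, 256, 4);
    if (compute_cs) {
       /* Packets are written to compute_cs like to any other IB, but
        * every other winsys call must be made on gfx_cs. */
       radeon_emit(compute_cs, 0xffff1000); /* placeholder type3 NOP */

       /* Make only the compute IB wait for the fence, and release the
        * wait when the fence's job starts executing instead of when it
        * finishes. */
       ws->cs_add_fence_dependency(gfx_cs, fence,
                                   RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY |
                                   RADEON_DEPENDENCY_START_FENCE);

       /* One flush submits both IBs. */
       ws->cs_flush(gfx_cs, 0, NULL);
    }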
diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c
index 3d23597413c..e3c1e0959fd 100644
--- a/src/gallium/drivers/radeonsi/si_fence.c
+++ b/src/gallium/drivers/radeonsi/si_fence.c
@@ -178,8 +178,8 @@ static void si_add_fence_dependency(struct si_context *sctx,
struct radeon_winsys *ws = sctx->ws;
if (sctx->dma_cs)
- ws->cs_add_fence_dependency(sctx->dma_cs, fence);
- ws->cs_add_fence_dependency(sctx->gfx_cs, fence);
+ ws->cs_add_fence_dependency(sctx->dma_cs, fence, 0);
+ ws->cs_add_fence_dependency(sctx->gfx_cs, fence, 0);
}
static void si_add_syncobj_signal(struct si_context *sctx,
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 4a588d52930..1438b1ffe76 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -35,6 +35,14 @@
DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
+#ifndef AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID
+#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
+#endif
+
+#ifndef AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES
+#define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 0x07
+#endif
+
/* FENCES */
static struct pipe_fence_handle *
@@ -717,6 +725,7 @@ static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib,
static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type)
{
+ /* The maximum IB size including all chained IBs. */
switch (ib_type) {
case IB_MAIN:
/* Smaller submits means the GPU gets busy sooner and there is less
@@ -724,6 +733,9 @@ static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type)
* http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
*/
return 20 * 1024;
+ case IB_PARALLEL_COMPUTE:
+ /* Always chain this IB. */
+ return UINT_MAX;
default:
unreachable("bad ib_type");
}
@@ -739,12 +751,15 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs,
*/
struct amdgpu_ib *ib = NULL;
struct drm_amdgpu_cs_chunk_ib *info = &cs->csc->ib[ib_type];
- unsigned ib_size = 0;
+ /* This is the minimum size of a contiguous IB. */
+ unsigned ib_size = 4 * 1024 * 4;
switch (ib_type) {
+ case IB_PARALLEL_COMPUTE:
+ ib = &cs->compute_ib;
+ break;
case IB_MAIN:
ib = &cs->main;
- ib_size = 4 * 1024 * 4;
break;
default:
unreachable("unhandled IB type");
@@ -866,6 +881,9 @@ static bool amdgpu_init_cs_context(struct amdgpu_winsys *ws,
assert(0);
}
+ cs->ib[IB_PARALLEL_COMPUTE].ip_type = AMDGPU_HW_IP_COMPUTE;
+ cs->ib[IB_PARALLEL_COMPUTE].flags = AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
+
memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
cs->last_added_bo = NULL;
return true;
@@ -897,6 +915,8 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs)
cleanup_fence_list(&cs->fence_dependencies);
cleanup_fence_list(&cs->syncobj_dependencies);
cleanup_fence_list(&cs->syncobj_to_signal);
+ cleanup_fence_list(&cs->compute_fence_dependencies);
+ cleanup_fence_list(&cs->compute_start_fence_dependencies);
cs->num_real_buffers = 0;
cs->num_slab_buffers = 0;
@@ -916,6 +936,8 @@ static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs)
FREE(cs->fence_dependencies.list);
FREE(cs->syncobj_dependencies.list);
FREE(cs->syncobj_to_signal.list);
+ FREE(cs->compute_fence_dependencies.list);
+ FREE(cs->compute_start_fence_dependencies.list);
}
@@ -949,6 +971,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
amdgpu_cs_chunk_fence_info_to_data(&fence_info, (void*)&cs->fence_chunk);
cs->main.ib_type = IB_MAIN;
+ cs->compute_ib.ib_type = IB_PARALLEL_COMPUTE;
if (!amdgpu_init_cs_context(ctx->ws, &cs->csc1, ring_type)) {
FREE(cs);
@@ -976,6 +999,77 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
return &cs->main.base;
}
+static struct radeon_cmdbuf *
+amdgpu_cs_add_parallel_compute_ib(struct radeon_cmdbuf *ib,
+ unsigned gds_size, unsigned num_oa_counters)
+{
+ struct amdgpu_cs *cs = (struct amdgpu_cs*)ib;
+ struct amdgpu_winsys *ws = cs->ctx->ws;
+
+ if (cs->ring_type != RING_GFX)
+ return NULL;
+
+ /* Only one secondary IB can be added. */
+ if (cs->compute_ib.ib_mapped)
+ return NULL;
+
+ assert(gds_size || !num_oa_counters); /* OA requires GDS */
+
+ amdgpu_bo_handle gds_mem = NULL, gds_oa = NULL;
+
+ /* Optionally allocate GDS resources for the IB. */
+ if (gds_size) {
+ struct amdgpu_bo_alloc_request gds_mem_info = {0}, gds_oa_info = {0};
+
+ gds_mem_info.alloc_size = gds_size;
+ gds_mem_info.phys_alignment = 4;
+ gds_mem_info.preferred_heap = AMDGPU_GEM_DOMAIN_GDS;
+
+ gds_oa_info.alloc_size = num_oa_counters;
+ gds_oa_info.phys_alignment = 1;
+ gds_oa_info.preferred_heap = AMDGPU_GEM_DOMAIN_OA;
+
+ if (amdgpu_bo_alloc(ws->dev, &gds_mem_info, &gds_mem)) {
+ fprintf(stderr, "amdgpu: Failed to create a GDS memory buffer.\n");
+ return NULL;
+ }
+
+ if (num_oa_counters &&
+ amdgpu_bo_alloc(ws->dev, &gds_oa_info, &gds_oa)) {
+ fprintf(stderr, "amdgpu: Failed to create a GDS OA buffer.\n");
+ amdgpu_bo_free(gds_mem);
+ return NULL;
+ }
+ }
+
+ /* Allocate the compute IB. */
+ if (!amdgpu_get_new_ib(&ws->base, cs, IB_PARALLEL_COMPUTE)) {
+ if (gds_mem)
+ amdgpu_bo_free(gds_mem);
+ if (gds_oa)
+ amdgpu_bo_free(gds_oa);
+ return NULL;
+ }
+
+ if (gds_mem) {
+ cs->compute_gds_mem = gds_mem;
+ cs->compute_gds_oa = gds_oa;
+
+ amdgpu_bo_export(gds_mem, amdgpu_bo_handle_type_kms,
+ &cs->compute_gds_mem_kms_handle);
+ if (gds_oa) {
+ amdgpu_bo_export(gds_oa, amdgpu_bo_handle_type_kms,
+ &cs->compute_gds_oa_kms_handle);
+
+ cs->csc1.ib[IB_PARALLEL_COMPUTE].flags |=
+ AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID;
+ cs->csc2.ib[IB_PARALLEL_COMPUTE].flags |=
+ AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID;
+ }
+ }
+ return &cs->compute_ib.base;
+}
+
static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
{
return true;
@@ -1105,6 +1199,11 @@ static void add_fence_to_list(struct amdgpu_fence_list *fences,
amdgpu_fence_reference(&fences->list[idx], (struct pipe_fence_handle*)fence);
}
+/* TODO: recognizing dependencies as no-ops doesn't take the parallel
+ * compute IB into account. The compute IB won't wait for these.
+ * Also, the scheduler can execute compute and SDMA IBs on any ring.
+ * Should we always insert dependencies?
+ */
static bool is_noop_fence_dependency(struct amdgpu_cs *acs,
struct amdgpu_fence *fence)
{
@@ -1121,7 +1220,8 @@ static bool is_noop_fence_dependency(struct amdgpu_cs *acs,
}
static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,
- struct pipe_fence_handle *pfence)
+ struct pipe_fence_handle *pfence,
+ unsigned dependency_flags)
{
struct amdgpu_cs *acs = amdgpu_cs(rws);
struct amdgpu_cs_context *cs = acs->csc;
@@ -1129,6 +1229,21 @@ static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,
util_queue_fence_wait(&fence->submitted);
+ if (dependency_flags & RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY) {
+ /* Syncobjs are not needed here. */
+ assert(!amdgpu_fence_is_syncobj(fence));
+
+ if (acs->ctx->ws->info.has_scheduled_fence_dependency &&
+ dependency_flags & RADEON_DEPENDENCY_START_FENCE)
+ add_fence_to_list(&cs->compute_start_fence_dependencies, fence);
+ else
+ add_fence_to_list(&cs->compute_fence_dependencies, fence);
+ return;
+ }
+
+ /* Start fences are not supported for the gfx IB. */
+ assert(!(dependency_flags & RADEON_DEPENDENCY_START_FENCE));
+
if (is_noop_fence_dependency(acs, fence))
return;
@@ -1325,7 +1440,7 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
}
struct drm_amdgpu_bo_list_entry *list =
- alloca(cs->num_real_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
+ alloca((cs->num_real_buffers + 2) * sizeof(struct drm_amdgpu_bo_list_entry));
unsigned num_handles = 0;
for (i = 0; i < cs->num_real_buffers; ++i) {
@@ -1341,6 +1456,18 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
++num_handles;
}
+ if (acs->compute_gds_mem) {
+ list[num_handles].bo_handle = acs->compute_gds_mem_kms_handle;
+ list[num_handles].bo_priority = 0;
+ ++num_handles;
+
+ if (acs->compute_gds_oa) {
+ list[num_handles].bo_handle = acs->compute_gds_oa_kms_handle;
+ list[num_handles].bo_priority = 0;
+ ++num_handles;
+ }
+ }
+
if (use_bo_list_create) {
/* Legacy path creating the buffer list handle and passing it to the CS ioctl. */
r = amdgpu_bo_list_create_raw(ws->dev, num_handles, list, &bo_list);
@@ -1418,6 +1545,66 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
num_chunks++;
}
+ /* Submit the parallel compute IB first. */
+ if (cs->ib[IB_PARALLEL_COMPUTE].ib_bytes > 0) {
+ unsigned old_num_chunks = num_chunks;
+
+ /* Add compute fence dependencies. */
+ unsigned num_dependencies = cs->compute_fence_dependencies.num;
+ if (num_dependencies) {
+ struct drm_amdgpu_cs_chunk_dep *dep_chunk =
+ alloca(num_dependencies * sizeof(*dep_chunk));
+
+ for (unsigned i = 0; i < num_dependencies; i++) {
+ struct amdgpu_fence *fence =
+ (struct amdgpu_fence*)cs->compute_fence_dependencies.list[i];
+
+ assert(util_queue_fence_is_signalled(&fence->submitted));
+ amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]);
+ }
+
+ chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
+ chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_dependencies;
+ chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
+ num_chunks++;
+ }
+
+ /* Add compute start fence dependencies. */
+ unsigned num_start_dependencies = cs->compute_start_fence_dependencies.num;
+ if (num_start_dependencies) {
+ struct drm_amdgpu_cs_chunk_dep *dep_chunk =
+ alloca(num_start_dependencies * sizeof(*dep_chunk));
+
+ for (unsigned i = 0; i < num_start_dependencies; i++) {
+ struct amdgpu_fence *fence =
+ (struct amdgpu_fence*)cs->compute_start_fence_dependencies.list[i];
+
+ assert(util_queue_fence_is_signalled(&fence->submitted));
+ amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]);
+ }
+
+ chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES;
+ chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_start_dependencies;
+ chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
+ num_chunks++;
+ }
+
+ /* Convert from dwords to bytes. */
+ cs->ib[IB_PARALLEL_COMPUTE].ib_bytes *= 4;
+ chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
+ chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
+ chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_PARALLEL_COMPUTE];
+ num_chunks++;
+
+ r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
+ num_chunks, chunks, NULL);
+ if (r)
+ goto finalize;
+
+ /* Back num_chunks off to drop the compute-only chunks again. */
+ num_chunks = old_num_chunks;
+ }
+
/* Syncobj signals. */
unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num;
if (num_syncobj_to_signal) {
@@ -1459,6 +1646,7 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
num_chunks, chunks, &seq_no);
}
+finalize:
if (r) {
if (r == -ENOMEM)
@@ -1544,6 +1732,12 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
}
if (cs->ring_type == RING_GFX)
ws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
+
+ /* Also pad secondary IBs. */
+ if (cs->compute_ib.ib_mapped) {
+ while (cs->compute_ib.base.current.cdw & 7)
+ radeon_emit(&cs->compute_ib.base, 0xffff1000); /* type3 nop packet */
+ }
break;
case RING_UVD:
case RING_UVD_ENC:
@@ -1579,6 +1773,9 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
/* Set IB sizes. */
amdgpu_ib_finalize(ws, &cs->main);
+ if (cs->compute_ib.ib_mapped)
+ amdgpu_ib_finalize(ws, &cs->compute_ib);
+
/* Create a fence. */
amdgpu_fence_reference(&cur->fence, NULL);
if (cs->next_fence) {
@@ -1624,6 +1821,8 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
}
amdgpu_get_new_ib(&ws->base, cs, IB_MAIN);
+ if (cs->compute_ib.ib_mapped)
+ amdgpu_get_new_ib(&ws->base, cs, IB_PARALLEL_COMPUTE);
cs->main.base.used_gart = 0;
cs->main.base.used_vram = 0;
@@ -1645,9 +1844,15 @@ static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
p_atomic_dec(&cs->ctx->ws->num_cs);
pb_reference(&cs->main.big_ib_buffer, NULL);
FREE(cs->main.base.prev);
+ pb_reference(&cs->compute_ib.big_ib_buffer, NULL);
+ FREE(cs->compute_ib.base.prev);
amdgpu_destroy_cs_context(&cs->csc1);
amdgpu_destroy_cs_context(&cs->csc2);
amdgpu_fence_reference(&cs->next_fence, NULL);
+ if (cs->compute_gds_mem)
+ amdgpu_bo_free(cs->compute_gds_mem);
+ if (cs->compute_gds_oa)
+ amdgpu_bo_free(cs->compute_gds_oa);
FREE(cs);
}
@@ -1667,6 +1872,7 @@ void amdgpu_cs_init_functions(struct amdgpu_winsys *ws)
ws->base.ctx_destroy = amdgpu_ctx_destroy;
ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
ws->base.cs_create = amdgpu_cs_create;
+ ws->base.cs_add_parallel_compute_ib = amdgpu_cs_add_parallel_compute_ib;
ws->base.cs_destroy = amdgpu_cs_destroy;
ws->base.cs_add_buffer = amdgpu_cs_add_buffer;
ws->base.cs_validate = amdgpu_cs_validate;
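Since the compute IB produces no output fence, gfx-side waits are up to
the driver; below is a sketch of the kind of wait packet the gfx IB
could use, in radeonsi's packet style (va, the reference value, and the
marker write are assumptions for the example, not part of this patch):

    /* The gfx IB polls a memory marker that the compute IB writes when
     * its work is done (WAIT_REG_MEM; illustration only). */
    radeon_emit(gfx_cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
    radeon_emit(gfx_cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
    radeon_emit(gfx_cs, va);          /* marker address lo */
    radeon_emit(gfx_cs, va >> 32);    /* marker address hi */
    radeon_emit(gfx_cs, 1);           /* reference value */
    radeon_emit(gfx_cs, 0xffffffff);  /* mask */
    radeon_emit(gfx_cs, 4);           /* poll interval */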
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index 4f49a9065c6..474f4211b8e 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -57,6 +57,7 @@ struct amdgpu_cs_buffer {
enum ib_type {
IB_MAIN,
+ IB_PARALLEL_COMPUTE,
IB_NUM,
};
@@ -113,6 +114,10 @@ struct amdgpu_cs_context {
struct amdgpu_fence_list syncobj_dependencies;
struct amdgpu_fence_list syncobj_to_signal;
+ /* The compute IB uses the dependencies above + these: */
+ struct amdgpu_fence_list compute_fence_dependencies;
+ struct amdgpu_fence_list compute_start_fence_dependencies;
+
struct pipe_fence_handle *fence;
/* the error returned from cs_flush for non-async submissions */
@@ -121,6 +126,7 @@ struct amdgpu_cs_context {
struct amdgpu_cs {
struct amdgpu_ib main; /* must be first because this is inherited */
+ struct amdgpu_ib compute_ib; /* optional parallel compute IB */
struct amdgpu_ctx *ctx;
enum ring_type ring_type;
struct drm_amdgpu_cs_chunk_fence fence_chunk;
@@ -142,6 +148,11 @@ struct amdgpu_cs {
struct util_queue_fence flush_completed;
struct pipe_fence_handle *next_fence;
+
+ amdgpu_bo_handle compute_gds_mem;
+ amdgpu_bo_handle compute_gds_oa;
+ uint32_t compute_gds_mem_kms_handle;
+ uint32_t compute_gds_oa_kms_handle;
};
struct amdgpu_fence {
@@ -220,6 +231,8 @@ amdgpu_cs_from_ib(struct amdgpu_ib *ib)
switch (ib->ib_type) {
case IB_MAIN:
return get_container(ib, struct amdgpu_cs, main);
+ case IB_PARALLEL_COMPUTE:
+ return get_container(ib, struct amdgpu_cs, compute_ib);
default:
unreachable("bad ib_type");
}
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 490c246d6e0..2288c320975 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -798,7 +798,8 @@ radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
static void
radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
- struct pipe_fence_handle *fence)
+ struct pipe_fence_handle *fence,
+ unsigned dependency_flags)
{
/* TODO: Handle the following unlikely multi-threaded scenario:
*
--
2.17.1