[Mesa-dev] [PATCH 1/4] radeon/winsys: add dma ring support to winsys v3

j.glisse at gmail.com j.glisse at gmail.com
Fri Jan 25 09:50:47 PST 2013


From: Jerome Glisse <jglisse at redhat.com>

Add ring support: you can now create a cs for each ring. The DMA ring is
a bit special regarding relocations, as you must emit as many relocations
as there are uses of the buffer.

v2: - Improved comment on relocation changes
    - Use a single thread to queue cs submissions; this simplifies driver
      code while not impacting performance. The rationale for this is that
      you have to wait for all previous submissions to have completed,
      so there was never a case where we could have 2 different threads
      submitting a command stream at the same time. This code just
      consolidates submission into one single thread per winsys.
v3: - Do not use a semaphore for empty-queue signaling; use a cond var
      instead. This is because it's tricky to keep the number of calls
      to semaphore wait and semaphore signal balanced (the number of
      cs in the stack would, for instance, make that number vary).

Signed-off-by: Jerome Glisse <jglisse at redhat.com>
---
 src/gallium/drivers/r300/r300_context.c           |   2 +-
 src/gallium/drivers/r600/r600_pipe.c              |   2 +-
 src/gallium/drivers/radeonsi/radeonsi_pipe.c      |   2 +-
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c     |   2 +-
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c     | 160 ++++++++++++----------
 src/gallium/winsys/radeon/drm/radeon_drm_cs.h     |   8 +-
 src/gallium/winsys/radeon/drm/radeon_drm_winsys.c |  87 ++++++++++++
 src/gallium/winsys/radeon/drm/radeon_drm_winsys.h |  17 +++
 src/gallium/winsys/radeon/drm/radeon_winsys.h     |  20 ++-
 9 files changed, 218 insertions(+), 82 deletions(-)

diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index d8af13f..340a7f0 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -379,7 +379,7 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
                      sizeof(struct pipe_transfer), 64,
                      UTIL_SLAB_SINGLETHREADED);
 
-    r300->cs = rws->cs_create(rws);
+    r300->cs = rws->cs_create(rws, RING_GFX);
     if (r300->cs == NULL)
         goto fail;
 
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index fda5074..e4a35cf 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -289,7 +289,7 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void
 		goto fail;
 	}
 
-	rctx->cs = rctx->ws->cs_create(rctx->ws);
+	rctx->cs = rctx->ws->cs_create(rctx->ws, RING_GFX);
 	rctx->ws->cs_set_flush_callback(rctx->cs, r600_flush_from_winsys, rctx);
 
 	rctx->uploader = u_upload_create(&rctx->context, 1024 * 1024, 256,
diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.c b/src/gallium/drivers/radeonsi/radeonsi_pipe.c
index cbb3bc4..5792fe2 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_pipe.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.c
@@ -222,7 +222,7 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void
 	case TAHITI:
 		si_init_state_functions(rctx);
 		LIST_INITHEAD(&rctx->active_query_list);
-		rctx->cs = rctx->ws->cs_create(rctx->ws);
+		rctx->cs = rctx->ws->cs_create(rctx->ws, RING_GFX);
 		rctx->max_db = 8;
 		si_init_config(rctx);
 		break;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 897e962..6daafc3 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -453,7 +453,7 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf,
                 } else {
                     /* Try to avoid busy-waiting in radeon_bo_wait. */
                     if (p_atomic_read(&bo->num_active_ioctls))
-                        radeon_drm_cs_sync_flush(cs);
+                        radeon_drm_cs_sync_flush(rcs);
                 }
 
                 radeon_bo_wait((struct pb_buffer*)bo, RADEON_USAGE_READWRITE);
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index c5e7f1e..cab2704 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -90,6 +90,10 @@
 #define RADEON_CS_RING_COMPUTE      1
 #endif
 
+#ifndef RADEON_CS_RING_DMA
+#define RADEON_CS_RING_DMA          2
+#endif
+
 #ifndef RADEON_CS_END_OF_FRAME
 #define RADEON_CS_END_OF_FRAME      0x04
 #endif
@@ -158,10 +162,8 @@ static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
     FREE(csc->relocs);
 }
 
-DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)
-static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param);
 
-static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws)
+static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws, enum ring_type ring_type)
 {
     struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
     struct radeon_drm_cs *cs;
@@ -170,7 +172,6 @@ static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws)
     if (!cs) {
         return NULL;
     }
-    pipe_semaphore_init(&cs->flush_queued, 0);
     pipe_semaphore_init(&cs->flush_completed, 0);
 
     cs->ws = ws;
@@ -189,10 +190,9 @@ static struct radeon_winsys_cs *radeon_drm_cs_create(struct radeon_winsys *rws)
     cs->csc = &cs->csc1;
     cs->cst = &cs->csc2;
     cs->base.buf = cs->csc->buf;
+    cs->base.ring_type = ring_type;
 
     p_atomic_inc(&ws->num_cs);
-    if (cs->ws->num_cpus > 1 && debug_get_option_thread())
-        cs->thread = pipe_thread_create(radeon_drm_cs_emit_ioctl, cs);
     return &cs->base;
 }
 
@@ -246,35 +246,49 @@ int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
     return -1;
 }
 
-static unsigned radeon_add_reloc(struct radeon_cs_context *csc,
+static unsigned radeon_add_reloc(struct radeon_drm_cs *cs,
                                  struct radeon_bo *bo,
                                  enum radeon_bo_usage usage,
                                  enum radeon_bo_domain domains,
                                  enum radeon_bo_domain *added_domains)
 {
+    struct radeon_cs_context *csc = cs->csc;
     struct drm_radeon_cs_reloc *reloc;
-    unsigned i;
     unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
     enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
     enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
+    bool update_hash = TRUE;
+    int i;
 
+    *added_domains = 0;
     if (csc->is_handle_added[hash]) {
         i = csc->reloc_indices_hashlist[hash];
         reloc = &csc->relocs[i];
-        if (reloc->handle == bo->handle) {
-            update_reloc_domains(reloc, rd, wd, added_domains);
-            return i;
+        if (reloc->handle != bo->handle) {
+            /* Hash collision, look for the BO in the list of relocs linearly. */
+            for (i = csc->crelocs - 1; i >= 0; i--) {
+                reloc = &csc->relocs[i];
+                if (reloc->handle == bo->handle) {
+                    /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
+                    break;
+                }
+            }
         }
 
-        /* Hash collision, look for the BO in the list of relocs linearly. */
-        for (i = csc->crelocs; i != 0;) {
-            --i;
-            reloc = &csc->relocs[i];
-            if (reloc->handle == bo->handle) {
-                update_reloc_domains(reloc, rd, wd, added_domains);
-
+        if (i >= 0) {
+            /* On the DMA ring we need to emit as many relocations as there are uses of
+             * the bo, thus each time this function is called we should add the bo to
+             * the relocation buffer again.
+             *
+             * Do not update the hash table if it's the DMA ring, so that the hash always points
+             * to the first bo relocation, which is the one used by the kernel. Following relocations
+             * will be ignored by the kernel memory placement (but are still used by the kernel to
+             * update the cmd stream with the proper buffer offset).
+             */
+            update_hash = FALSE;
+            update_reloc_domains(reloc, rd, wd, added_domains);
+            if (cs->base.ring_type != RING_DMA) {
                 csc->reloc_indices_hashlist[hash] = i;
-                /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                 return i;
             }
         }
@@ -305,7 +319,9 @@ static unsigned radeon_add_reloc(struct radeon_cs_context *csc,
     reloc->flags = 0;
 
     csc->is_handle_added[hash] = TRUE;
-    csc->reloc_indices_hashlist[hash] = csc->crelocs;
+    if (update_hash) {
+        csc->reloc_indices_hashlist[hash] = csc->crelocs;
+    }
 
     csc->chunks[1].length_dw += RELOC_DWORDS;
 
@@ -321,8 +337,7 @@ static unsigned radeon_drm_cs_add_reloc(struct radeon_winsys_cs *rcs,
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
     struct radeon_bo *bo = (struct radeon_bo*)buf;
     enum radeon_bo_domain added_domains;
-
-    unsigned index = radeon_add_reloc(cs->csc, bo, usage, domains, &added_domains);
+    unsigned index = radeon_add_reloc(cs, bo, usage, domains, &added_domains);
 
     if (added_domains & RADEON_DOMAIN_GTT)
         cs->csc->used_gart += bo->base.size;
@@ -373,7 +388,6 @@ static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
     struct radeon_bo *bo = (struct radeon_bo*)buf;
-
     unsigned index = radeon_get_reloc(cs->csc, bo);
 
     if (index == -1) {
@@ -385,7 +399,7 @@ static void radeon_drm_cs_write_reloc(struct radeon_winsys_cs *rcs,
     OUT_CS(&cs->base, index * RELOC_DWORDS);
 }
 
-static void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_cs_context *csc)
+void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_cs_context *csc)
 {
     unsigned i;
 
@@ -410,25 +424,15 @@ static void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_cs_context *csc)
     radeon_cs_context_cleanup(csc);
 }
 
-static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param)
+/*
+ * Make sure previous submissions of this cs have completed
+ */
+void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
 {
-    struct radeon_drm_cs *cs = (struct radeon_drm_cs*)param;
-
-    while (1) {
-        pipe_semaphore_wait(&cs->flush_queued);
-        if (cs->kill_thread)
-            break;
-        radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
-        pipe_semaphore_signal(&cs->flush_completed);
-    }
-    pipe_semaphore_signal(&cs->flush_completed);
-    return NULL;
-}
+    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 
-void radeon_drm_cs_sync_flush(struct radeon_drm_cs *cs)
-{
     /* Wait for any pending ioctl to complete. */
-    if (cs->thread && cs->flush_started) {
+    if (cs->ws->thread && cs->flush_started) {
         pipe_semaphore_wait(&cs->flush_completed);
         cs->flush_started = 0;
     }
@@ -445,7 +449,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
        fprintf(stderr, "radeon: command stream overflowed\n");
     }
 
-    radeon_drm_cs_sync_flush(cs);
+    radeon_drm_cs_sync_flush(rcs);
 
     /* Flip command streams. */
     tmp = cs->csc;
@@ -453,8 +457,7 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
     cs->cst = tmp;
 
     /* If the CS is not empty or overflowed, emit it in a separate thread. */
-    if (cs->base.cdw && cs->base.cdw <= RADEON_MAX_CMDBUF_DWORDS &&
-	!debug_get_option_noop()) {
+    if (cs->base.cdw && cs->base.cdw <= RADEON_MAX_CMDBUF_DWORDS && !debug_get_option_noop()) {
         unsigned i, crelocs = cs->cst->crelocs;
 
         cs->cst->chunks[0].length_dw = cs->base.cdw;
@@ -464,31 +467,50 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
             p_atomic_inc(&cs->cst->relocs_bo[i]->num_active_ioctls);
         }
 
-        cs->cst->flags[0] = 0;
-        cs->cst->flags[1] = RADEON_CS_RING_GFX;
-        cs->cst->cs.num_chunks = 2;
-        if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
-            cs->cst->flags[0] |= RADEON_CS_KEEP_TILING_FLAGS;
-            cs->cst->cs.num_chunks = 3;
-        }
-        if (cs->ws->info.r600_virtual_address) {
-            cs->cst->flags[0] |= RADEON_CS_USE_VM;
-            cs->cst->cs.num_chunks = 3;
-        }
-        if (flags & RADEON_FLUSH_END_OF_FRAME) {
-            cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
-            cs->cst->cs.num_chunks = 3;
-        }
-        if (flags & RADEON_FLUSH_COMPUTE) {
-            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
+        switch (cs->base.ring_type) {
+        case RING_DMA:
+            cs->cst->flags[0] = 0;
+            cs->cst->flags[1] = RADEON_CS_RING_DMA;
             cs->cst->cs.num_chunks = 3;
+            if (cs->ws->info.r600_virtual_address) {
+                cs->cst->flags[0] |= RADEON_CS_USE_VM;
+            }
+            break;
+        default:
+        case RING_GFX:
+            cs->cst->flags[0] = 0;
+            cs->cst->flags[1] = RADEON_CS_RING_GFX;
+            cs->cst->cs.num_chunks = 2;
+            if (flags & RADEON_FLUSH_KEEP_TILING_FLAGS) {
+                cs->cst->flags[0] |= RADEON_CS_KEEP_TILING_FLAGS;
+                cs->cst->cs.num_chunks = 3;
+            }
+            if (cs->ws->info.r600_virtual_address) {
+                cs->cst->flags[0] |= RADEON_CS_USE_VM;
+                cs->cst->cs.num_chunks = 3;
+            }
+            if (flags & RADEON_FLUSH_END_OF_FRAME) {
+                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
+                cs->cst->cs.num_chunks = 3;
+            }
+            if (flags & RADEON_FLUSH_COMPUTE) {
+                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
+                cs->cst->cs.num_chunks = 3;
+            }
+            break;
         }
 
-        if (cs->thread &&
-            (flags & RADEON_FLUSH_ASYNC)) {
+        if (cs->ws->thread && (flags & RADEON_FLUSH_ASYNC)) {
             cs->flush_started = 1;
-            pipe_semaphore_signal(&cs->flush_queued);
+            radeon_drm_ws_queue_cs(cs->ws, cs);
         } else {
+            pipe_mutex_lock(cs->ws->cs_stack_lock);
+            if (cs->ws->thread) {
+                while (p_atomic_read(&cs->ws->ncs)) {
+                    pipe_condvar_wait(cs->ws->cs_queue_empty, cs->ws->cs_stack_lock);
+                }
+            }
+            pipe_mutex_unlock(cs->ws->cs_stack_lock);
             radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
         }
     } else {
@@ -503,14 +525,8 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags)
 static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
-    radeon_drm_cs_sync_flush(cs);
-    if (cs->thread) {
-        cs->kill_thread = 1;
-        pipe_semaphore_signal(&cs->flush_queued);
-        pipe_semaphore_wait(&cs->flush_completed);
-        pipe_thread_wait(cs->thread);
-    }
-    pipe_semaphore_destroy(&cs->flush_queued);
+
+    radeon_drm_cs_sync_flush(rcs);
     pipe_semaphore_destroy(&cs->flush_completed);
     radeon_cs_context_cleanup(&cs->csc1);
     radeon_cs_context_cleanup(&cs->csc2);
@@ -525,6 +541,7 @@ static void radeon_drm_cs_set_flush(struct radeon_winsys_cs *rcs,
                                     void *user)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
+
     cs->flush_cs = flush;
     cs->flush_data = user;
 }
@@ -562,4 +579,5 @@ void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
     ws->base.cs_flush = radeon_drm_cs_flush;
     ws->base.cs_set_flush_callback = radeon_drm_cs_set_flush;
     ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
+    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
 }
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
index a88fba5..570842d 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
@@ -74,9 +74,8 @@ struct radeon_drm_cs {
     void (*flush_cs)(void *ctx, unsigned flags);
     void *flush_data;
 
-    pipe_thread thread;
-    int flush_started, kill_thread;
-    pipe_semaphore flush_queued, flush_completed;
+    int flush_started;
+    pipe_semaphore flush_completed;
 };
 
 int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo);
@@ -118,7 +117,8 @@ radeon_bo_is_referenced_by_any_cs(struct radeon_bo *bo)
     return bo->num_cs_references != 0;
 }
 
-void radeon_drm_cs_sync_flush(struct radeon_drm_cs *cs);
+void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs);
 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws);
+void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_cs_context *csc);
 
 #endif
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index b8a876c..d23220d 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -316,6 +316,12 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
         break;
     }
 
+    /* Check for dma */
+    ws->info.r600_has_dma = FALSE;
+    if (ws->info.chip_class >= R700 && ws->info.drm_minor >= 27) {
+        ws->info.r600_has_dma = TRUE;
+    }
+
     /* Get GEM info. */
     retval = drmCommandWriteRead(ws->fd, DRM_RADEON_GEM_INFO,
             &gem_info, sizeof(gem_info));
@@ -389,12 +395,21 @@ static void radeon_winsys_destroy(struct radeon_winsys *rws)
 {
     struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)rws;
 
+    if (ws->thread) {
+        ws->kill_thread = 1;
+        pipe_semaphore_signal(&ws->cs_queued);
+        pipe_thread_wait(ws->thread);
+    }
+    pipe_semaphore_destroy(&ws->cs_queued);
+    pipe_condvar_destroy(ws->cs_queue_empty);
+
     if (!pipe_reference(&ws->base.reference, NULL)) {
         return;
     }
 
     pipe_mutex_destroy(ws->hyperz_owner_mutex);
     pipe_mutex_destroy(ws->cmask_owner_mutex);
+    pipe_mutex_destroy(ws->cs_stack_lock);
 
     ws->cman->destroy(ws->cman);
     ws->kman->destroy(ws->kman);
@@ -477,6 +492,71 @@ static int compare_fd(void *key1, void *key2)
     return pointer_to_intptr(key1) != pointer_to_intptr(key2);
 }
 
+void radeon_drm_ws_queue_cs(struct radeon_drm_winsys *ws, struct radeon_drm_cs *cs)
+{
+retry:
+    pipe_mutex_lock(ws->cs_stack_lock);
+    if (p_atomic_read(&ws->ncs) >= RING_LAST) {
+        /* no room left for a flush */
+        pipe_mutex_unlock(ws->cs_stack_lock);
+        goto retry;
+    }
+    ws->cs_stack[p_atomic_read(&ws->ncs)] = cs;
+    p_atomic_inc(&ws->ncs);
+    pipe_mutex_unlock(ws->cs_stack_lock);
+    pipe_semaphore_signal(&ws->cs_queued);
+}
+
+static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param)
+{
+    struct radeon_drm_winsys *ws = (struct radeon_drm_winsys *)param;
+    struct radeon_drm_cs *cs;
+    unsigned i, empty_stack;
+
+    while (1) {
+        pipe_semaphore_wait(&ws->cs_queued);
+        if (ws->kill_thread)
+            break;
+next:
+        pipe_mutex_lock(ws->cs_stack_lock);
+        cs = ws->cs_stack[0];
+        pipe_mutex_unlock(ws->cs_stack_lock);
+
+        if (cs) {
+            radeon_drm_cs_emit_ioctl_oneshot(cs->cst);
+
+            pipe_mutex_lock(ws->cs_stack_lock);
+            for (i = 1; i < p_atomic_read(&ws->ncs); i++) {
+                ws->cs_stack[i - 1] = ws->cs_stack[i];
+            }
+            ws->cs_stack[p_atomic_read(&ws->ncs) - 1] = NULL;
+            empty_stack = p_atomic_dec_zero(&ws->ncs);
+            if (empty_stack) {
+                pipe_condvar_signal(ws->cs_queue_empty);
+            }
+            pipe_mutex_unlock(ws->cs_stack_lock);
+
+            pipe_semaphore_signal(&cs->flush_completed);
+
+            if (!empty_stack) {
+                goto next;
+            }
+        }
+    }
+    pipe_mutex_lock(ws->cs_stack_lock);
+    for (i = 0; i < p_atomic_read(&ws->ncs); i++) {
+        pipe_semaphore_signal(&ws->cs_stack[i]->flush_completed);
+        ws->cs_stack[i] = NULL;
+    }
+    p_atomic_set(&ws->ncs, 0);
+    pipe_condvar_signal(ws->cs_queue_empty);
+    pipe_mutex_unlock(ws->cs_stack_lock);
+    return NULL;
+}
+
+DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)
+static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_ioctl, param);
+
 struct radeon_winsys *radeon_drm_winsys_create(int fd)
 {
     struct radeon_drm_winsys *ws;
@@ -531,6 +611,13 @@ struct radeon_winsys *radeon_drm_winsys_create(int fd)
 
     pipe_mutex_init(ws->hyperz_owner_mutex);
     pipe_mutex_init(ws->cmask_owner_mutex);
+    pipe_mutex_init(ws->cs_stack_lock);
+
+    p_atomic_set(&ws->ncs, 0);
+    pipe_semaphore_init(&ws->cs_queued, 0);
+    pipe_condvar_init(ws->cs_queue_empty);
+    if (ws->num_cpus > 1 && debug_get_option_thread())
+        ws->thread = pipe_thread_create(radeon_drm_cs_emit_ioctl, ws);
 
     return &ws->base;
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
index e714127..74eb408 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
@@ -33,6 +33,8 @@
 #include "radeon_winsys.h"
 #include "os/os_thread.h"
 
+struct radeon_drm_cs;
+
 enum radeon_generation {
     DRV_R300,
     DRV_R600,
@@ -58,6 +60,19 @@ struct radeon_drm_winsys {
     pipe_mutex hyperz_owner_mutex;
     struct radeon_drm_cs *cmask_owner;
     pipe_mutex cmask_owner_mutex;
+
+    /* rings submission thread */
+    pipe_mutex cs_stack_lock;
+    pipe_semaphore cs_queued;
+    /* we cannot use a semaphore for empty-queue signaling because keeping
+     * the number of calls to semaphore_wait and semaphore_signal balanced
+     * is, to say the least, tricky
+     */
+    pipe_condvar cs_queue_empty;
+    pipe_thread thread;
+    int kill_thread;
+    int ncs;
+    struct radeon_drm_cs *cs_stack[RING_LAST];
 };
 
 static INLINE struct radeon_drm_winsys *
@@ -66,4 +81,6 @@ radeon_drm_winsys(struct radeon_winsys *base)
     return (struct radeon_drm_winsys*)base;
 }
 
+void radeon_drm_ws_queue_cs(struct radeon_drm_winsys *ws, struct radeon_drm_cs *cs);
+
 #endif
diff --git a/src/gallium/winsys/radeon/drm/radeon_winsys.h b/src/gallium/winsys/radeon/drm/radeon_winsys.h
index d0c4822..7fdef3f 100644
--- a/src/gallium/winsys/radeon/drm/radeon_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_winsys.h
@@ -138,12 +138,19 @@ enum chip_class {
     TAHITI,
 };
 
+enum ring_type {
+    RING_GFX = 0,
+    RING_DMA,
+    RING_LAST,
+};
+
 struct winsys_handle;
 struct radeon_winsys_cs_handle;
 
 struct radeon_winsys_cs {
-    unsigned cdw;  /* Number of used dwords. */
-    uint32_t *buf; /* The command buffer. */
+    unsigned                    cdw;  /* Number of used dwords. */
+    uint32_t                    *buf; /* The command buffer. */
+    enum ring_type              ring_type;
 };
 
 struct radeon_info {
@@ -170,6 +177,7 @@ struct radeon_info {
     uint32_t                    r600_max_pipes;
     boolean                     r600_backend_map_valid;
     boolean                     r600_virtual_address;
+    boolean                     r600_has_dma;
 };
 
 enum radeon_feature_id {
@@ -350,7 +358,7 @@ struct radeon_winsys {
      *
      * \param ws        The winsys this function is called from.
      */
-    struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys *ws);
+    struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys *ws, enum ring_type ring_type);
 
     /**
      * Destroy a command stream.
@@ -433,6 +441,12 @@ struct radeon_winsys {
     boolean (*cs_request_feature)(struct radeon_winsys_cs *cs,
                                   enum radeon_feature_id fid,
                                   boolean enable);
+     /**
+      * Make sure all asynchronous flushes of the cs have completed
+      *
+      * \param cs        A command stream.
+      */
+    void (*cs_sync_flush)(struct radeon_winsys_cs *cs);
 
     /**
      * Initialize surface
-- 
1.7.11.7



More information about the mesa-dev mailing list