<div dir="ltr"><div>For the series:</div><div><br></div><div><div>Reviewed-by: Marek Olšák <<a href="mailto:marek.olsak@amd.com">marek.olsak@amd.com</a>></div><div><br></div><div>Marek</div></div></div><br><div class="gmail_quote"><div dir="ltr">On Thu, Dec 6, 2018 at 7:57 AM Nicolai Hähnle <<a href="mailto:nhaehnle@gmail.com">nhaehnle@gmail.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">From: Nicolai Hähnle <<a href="mailto:nicolai.haehnle@amd.com" target="_blank">nicolai.haehnle@amd.com</a>><br>
<br>
The following race condition could occur in the no-timeout case:<br>
<br>
API thread Gallium thread Watchdog<br>
---------- -------------- --------<br>
dd_before_draw<br>
u_threaded_context draw<br>
dd_after_draw<br>
add to dctx->records<br>
signal watchdog<br>
dump & destroy record<br>
execute draw<br>
dd_after_draw_async<br>
use-after-free!<br>
<br>
Alternatively, the same scenario would assert in a debug build when<br>
destroying the record because record->driver_finished has not signaled.<br>
<br>
Fix this and simplify the logic at the same time by<br>
- handing the record pointers off to the watchdog thread *before* each<br>
draw call and<br>
- waiting on the driver_finished fence in the watchdog thread.<br>
---<br>
.../auxiliary/driver_ddebug/dd_context.c | 1 -<br>
src/gallium/auxiliary/driver_ddebug/dd_draw.c | 103 ++++++++----------<br>
src/gallium/auxiliary/driver_ddebug/dd_pipe.h | 21 ++--<br>
3 files changed, 52 insertions(+), 73 deletions(-)<br>
<br>
diff --git a/src/gallium/auxiliary/driver_ddebug/dd_context.c b/src/gallium/auxiliary/driver_ddebug/dd_context.c<br>
index a9ac6ef14ab..15efeccf879 100644<br>
--- a/src/gallium/auxiliary/driver_ddebug/dd_context.c<br>
+++ b/src/gallium/auxiliary/driver_ddebug/dd_context.c<br>
@@ -589,21 +589,20 @@ static void<br>
dd_context_destroy(struct pipe_context *_pipe)<br>
{<br>
struct dd_context *dctx = dd_context(_pipe);<br>
struct pipe_context *pipe = dctx->pipe;<br>
<br>
dd_thread_join(dctx);<br>
mtx_destroy(&dctx->mutex);<br>
cnd_destroy(&dctx->cond);<br>
<br>
assert(list_empty(&dctx->records));<br>
- assert(!dctx->record_pending);<br>
<br>
if (pipe->set_log_context) {<br>
pipe->set_log_context(pipe, NULL);<br>
<br>
if (dd_screen(dctx->base.screen)->dump_mode == DD_DUMP_ALL_CALLS) {<br>
FILE *f = dd_get_file_stream(dd_screen(dctx->base.screen), 0);<br>
if (f) {<br>
fprintf(f, "Remainder of driver log:\n\n");<br>
}<br>
<br>
diff --git a/src/gallium/auxiliary/driver_ddebug/dd_draw.c b/src/gallium/auxiliary/driver_ddebug/dd_draw.c<br>
index cb5db8ab83b..a930299ebb7 100644<br>
--- a/src/gallium/auxiliary/driver_ddebug/dd_draw.c<br>
+++ b/src/gallium/auxiliary/driver_ddebug/dd_draw.c<br>
@@ -981,80 +981,75 @@ dd_report_hang(struct dd_context *dctx)<br>
}<br>
<br>
fclose(f);<br>
}<br>
<br>
if (top_not_reached)<br>
stop_output = true;<br>
encountered_hang = true;<br>
}<br>
<br>
- if (num_later || dctx->record_pending) {<br>
- fprintf(stderr, "... and %u%s additional draws.\n", num_later,<br>
- dctx->record_pending ? "+1 (pending)" : "");<br>
- }<br>
+ if (num_later)<br>
+ fprintf(stderr, "... and %u additional draws.\n", num_later);<br>
<br>
fprintf(stderr, "\nDone.\n");<br>
dd_kill_process();<br>
}<br>
<br>
int<br>
dd_thread_main(void *input)<br>
{<br>
struct dd_context *dctx = (struct dd_context *)input;<br>
struct dd_screen *dscreen = dd_screen(dctx->base.screen);<br>
struct pipe_screen *screen = dscreen->screen;<br>
<br>
mtx_lock(&dctx->mutex);<br>
<br>
for (;;) {<br>
struct list_head records;<br>
- struct pipe_fence_handle *fence;<br>
- struct pipe_fence_handle *fence2 = NULL;<br>
-<br>
list_replace(&dctx->records, &records);<br>
list_inithead(&dctx->records);<br>
dctx->num_records = 0;<br>
<br>
if (dctx->api_stalled)<br>
cnd_signal(&dctx->cond);<br>
<br>
- if (!list_empty(&records)) {<br>
- /* Wait for the youngest draw. This means hangs can take a bit longer<br>
- * to detect, but it's more efficient this way. */<br>
- struct dd_draw_record *youngest =<br>
- LIST_ENTRY(struct dd_draw_record, records.prev, list);<br>
- fence = youngest->bottom_of_pipe;<br>
- } else if (dctx->record_pending) {<br>
- /* Wait for pending fences, in case the driver ends up hanging internally. */<br>
- fence = dctx->record_pending->prev_bottom_of_pipe;<br>
- fence2 = dctx->record_pending->top_of_pipe;<br>
- } else if (dctx->kill_thread) {<br>
- break;<br>
- } else {<br>
+ if (list_empty(&records)) {<br>
+ if (dctx->kill_thread)<br>
+ break;<br>
+<br>
cnd_wait(&dctx->cond, &dctx->mutex);<br>
continue;<br>
}<br>
+<br>
mtx_unlock(&dctx->mutex);<br>
<br>
- /* Fences can be NULL legitimately when timeout detection is disabled. */<br>
- if ((fence &&<br>
- !screen->fence_finish(screen, NULL, fence,<br>
- (uint64_t)dscreen->timeout_ms * 1000*1000)) ||<br>
- (fence2 &&<br>
- !screen->fence_finish(screen, NULL, fence2,<br>
- (uint64_t)dscreen->timeout_ms * 1000*1000))) {<br>
- mtx_lock(&dctx->mutex);<br>
- list_splice(&records, &dctx->records);<br>
- dd_report_hang(dctx);<br>
- /* we won't actually get here */<br>
- mtx_unlock(&dctx->mutex);<br>
+ /* Wait for the youngest draw. This means hangs can take a bit longer<br>
+ * to detect, but it's more efficient this way. */<br>
+ struct dd_draw_record *youngest =<br>
+ list_last_entry(&records, struct dd_draw_record, list);<br>
+<br>
+ if (dscreen->timeout_ms > 0) {<br>
+ uint64_t abs_timeout = os_time_get_absolute_timeout(<br>
+ (uint64_t)dscreen->timeout_ms * 1000*1000);<br>
+<br>
+ if (!util_queue_fence_wait_timeout(&youngest->driver_finished, abs_timeout) ||<br>
+ !screen->fence_finish(screen, NULL, youngest->bottom_of_pipe,<br>
+ (uint64_t)dscreen->timeout_ms * 1000*1000)) {<br>
+ mtx_lock(&dctx->mutex);<br>
+ list_splice(&records, &dctx->records);<br>
+ dd_report_hang(dctx);<br>
+ /* we won't actually get here */<br>
+ mtx_unlock(&dctx->mutex);<br>
+ }<br>
+ } else {<br>
+ util_queue_fence_wait(&youngest->driver_finished);<br>
}<br>
<br>
list_for_each_entry_safe(struct dd_draw_record, record, &records, list) {<br>
dd_maybe_dump_record(dscreen, record);<br>
list_del(&record->list);<br>
dd_free_record(screen, record);<br>
}<br>
<br>
mtx_lock(&dctx->mutex);<br>
}<br>
@@ -1072,20 +1067,21 @@ dd_create_record(struct dd_context *dctx)<br>
return NULL;<br>
<br>
record->dctx = dctx;<br>
record->draw_call = dctx->num_draw_calls;<br>
<br>
record->prev_bottom_of_pipe = NULL;<br>
record->top_of_pipe = NULL;<br>
record->bottom_of_pipe = NULL;<br>
record->log_page = NULL;<br>
util_queue_fence_init(&record->driver_finished);<br>
+ util_queue_fence_reset(&record->driver_finished);<br>
<br>
dd_init_copy_of_draw_state(&record->draw_state);<br>
dd_copy_draw_state(&record->draw_state.base, &dctx->draw_state);<br>
<br>
return record;<br>
}<br>
<br>
static void<br>
dd_context_flush(struct pipe_context *_pipe,<br>
struct pipe_fence_handle **fence, unsigned flags)<br>
@@ -1108,41 +1104,50 @@ dd_before_draw(struct dd_context *dctx, struct dd_draw_record *record)<br>
if (dscreen->timeout_ms > 0) {<br>
if (dscreen->flush_always && dctx->num_draw_calls >= dscreen->skip_count) {<br>
pipe->flush(pipe, &record->prev_bottom_of_pipe, 0);<br>
screen->fence_reference(screen, &record->top_of_pipe, record->prev_bottom_of_pipe);<br>
} else {<br>
pipe->flush(pipe, &record->prev_bottom_of_pipe,<br>
PIPE_FLUSH_DEFERRED | PIPE_FLUSH_BOTTOM_OF_PIPE);<br>
pipe->flush(pipe, &record->top_of_pipe,<br>
PIPE_FLUSH_DEFERRED | PIPE_FLUSH_TOP_OF_PIPE);<br>
}<br>
+ }<br>
<br>
- mtx_lock(&dctx->mutex);<br>
- dctx->record_pending = record;<br>
- if (list_empty(&dctx->records))<br>
- cnd_signal(&dctx->cond);<br>
- mtx_unlock(&dctx->mutex);<br>
+ mtx_lock(&dctx->mutex);<br>
+ if (unlikely(dctx->num_records > 10000)) {<br>
+ dctx->api_stalled = true;<br>
+ /* Since this is only a heuristic to prevent the API thread from getting<br>
+ * too far ahead, we don't need a loop here. */<br>
+ cnd_wait(&dctx->cond, &dctx->mutex);<br>
+ dctx->api_stalled = false;<br>
}<br>
+<br>
+ if (list_empty(&dctx->records))<br>
+ cnd_signal(&dctx->cond);<br>
+<br>
+ list_addtail(&record->list, &dctx->records);<br>
+ dctx->num_records++;<br>
+ mtx_unlock(&dctx->mutex);<br>
}<br>
<br>
static void<br>
dd_after_draw_async(void *data)<br>
{<br>
struct dd_draw_record *record = (struct dd_draw_record *)data;<br>
struct dd_context *dctx = record->dctx;<br>
struct dd_screen *dscreen = dd_screen(dctx->base.screen);<br>
<br>
record->log_page = u_log_new_page(&dctx->log);<br>
record->time_after = os_time_get_nano();<br>
<br>
- if (!util_queue_fence_is_signalled(&record->driver_finished))<br>
- util_queue_fence_signal(&record->driver_finished);<br>
+ util_queue_fence_signal(&record->driver_finished);<br>
<br>
if (dscreen->dump_mode == DD_DUMP_APITRACE_CALL &&<br>
dscreen->apitrace_dump_call > dctx->draw_state.apitrace_call_number) {<br>
dd_thread_join(dctx);<br>
/* No need to continue. */<br>
exit(0);<br>
}<br>
}<br>
<br>
static void<br>
@@ -1151,48 +1156,28 @@ dd_after_draw(struct dd_context *dctx, struct dd_draw_record *record)<br>
struct dd_screen *dscreen = dd_screen(dctx->base.screen);<br>
struct pipe_context *pipe = dctx->pipe;<br>
<br>
if (dscreen->timeout_ms > 0) {<br>
unsigned flush_flags;<br>
if (dscreen->flush_always && dctx->num_draw_calls >= dscreen->skip_count)<br>
flush_flags = 0;<br>
else<br>
flush_flags = PIPE_FLUSH_DEFERRED | PIPE_FLUSH_BOTTOM_OF_PIPE;<br>
pipe->flush(pipe, &record->bottom_of_pipe, flush_flags);<br>
-<br>
- assert(record == dctx->record_pending);<br>
}<br>
<br>
if (pipe->callback) {<br>
- util_queue_fence_reset(&record->driver_finished);<br>
pipe->callback(pipe, dd_after_draw_async, record, true);<br>
} else {<br>
dd_after_draw_async(record);<br>
}<br>
<br>
- mtx_lock(&dctx->mutex);<br>
- if (unlikely(dctx->num_records > 10000)) {<br>
- dctx->api_stalled = true;<br>
- /* Since this is only a heuristic to prevent the API thread from getting<br>
- * too far ahead, we don't need a loop here. */<br>
- cnd_wait(&dctx->cond, &dctx->mutex);<br>
- dctx->api_stalled = false;<br>
- }<br>
-<br>
- if (list_empty(&dctx->records))<br>
- cnd_signal(&dctx->cond);<br>
-<br>
- list_addtail(&record->list, &dctx->records);<br>
- dctx->record_pending = NULL;<br>
- dctx->num_records++;<br>
- mtx_unlock(&dctx->mutex);<br>
-<br>
++dctx->num_draw_calls;<br>
if (dscreen->skip_count && dctx->num_draw_calls % 10000 == 0)<br>
fprintf(stderr, "Gallium debugger reached %u draw calls.\n",<br>
dctx->num_draw_calls);<br>
}<br>
<br>
static void<br>
dd_context_draw_vbo(struct pipe_context *_pipe,<br>
const struct pipe_draw_info *info)<br>
{<br>
diff --git a/src/gallium/auxiliary/driver_ddebug/dd_pipe.h b/src/gallium/auxiliary/driver_ddebug/dd_pipe.h<br>
index 07c4d55017f..12da8280aa6 100644<br>
--- a/src/gallium/auxiliary/driver_ddebug/dd_pipe.h<br>
+++ b/src/gallium/auxiliary/driver_ddebug/dd_pipe.h<br>
@@ -267,20 +267,21 @@ struct dd_draw_state_copy<br>
};<br>
<br>
struct dd_draw_record {<br>
struct list_head list;<br>
struct dd_context *dctx;<br>
<br>
int64_t time_before;<br>
int64_t time_after;<br>
unsigned draw_call;<br>
<br>
+ /* The fence pointers are guaranteed to be valid once driver_finished is signalled */<br>
struct pipe_fence_handle *prev_bottom_of_pipe;<br>
struct pipe_fence_handle *top_of_pipe;<br>
struct pipe_fence_handle *bottom_of_pipe;<br>
<br>
struct dd_call call;<br>
struct dd_draw_state_copy draw_state;<br>
<br>
struct util_queue_fence driver_finished;<br>
struct u_log_page *log_page;<br>
};<br>
@@ -290,38 +291,32 @@ struct dd_context<br>
struct pipe_context base;<br>
struct pipe_context *pipe;<br>
<br>
struct dd_draw_state draw_state;<br>
unsigned num_draw_calls;<br>
<br>
struct u_log_context log;<br>
<br>
/* Pipelined hang detection.<br>
*<br>
- * This is without unnecessary flushes and waits. There is a memory-based<br>
- * fence that is incremented by clear_buffer every draw call. Driver fences<br>
- * are not used.<br>
+ * Before each draw call, a new dd_draw_record is created that contains<br>
+ * a copy of all states. After each draw call, the driver's log is added<br>
+ * to this record. Additionally, deferred fences are associated to each<br>
+ * record both before and after the draw.<br>
*<br>
- * After each draw call, a new dd_draw_record is created that contains<br>
- * a copy of all states, the output of pipe_context::dump_debug_state,<br>
- * and it has a fence number assigned. That's done without knowing whether<br>
- * that draw call is problematic or not. The record is added into the list<br>
- * of all records.<br>
- *<br>
- * An independent, separate thread loops over the list of records and checks<br>
- * their fences. Records with signalled fences are freed. On fence timeout,<br>
- * the thread dumps the records of in-flight draws.<br>
+ * The records are handed off to a separate thread which waits on the<br>
+ * records' fences. Records with signalled fences are freed. When a timeout<br>
+ * is detected, the thread dumps the records of in-flight draws.<br>
*/<br>
thrd_t thread;<br>
mtx_t mutex;<br>
cnd_t cond;<br>
- struct dd_draw_record *record_pending; /* currently inside the driver */<br>
struct list_head records; /* oldest record first */<br>
unsigned num_records;<br>
bool kill_thread;<br>
bool api_stalled;<br>
};<br>
<br>
<br>
struct pipe_context *<br>
dd_context_create(struct dd_screen *dscreen, struct pipe_context *pipe);<br>
<br>
-- <br>
2.19.1<br>
<br>
_______________________________________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org" target="_blank">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
</blockquote></div>