[Mesa-dev] [PATCH 1/2] radeonsi/gfx9: prevent a GPU hang after a timestamp event

Marek Olšák maraeo at gmail.com
Thu Aug 17 17:58:49 UTC 2017


From: Marek Olšák <marek.olsak at amd.com>

---
 src/gallium/drivers/radeon/r600_pipe_common.c | 35 ++++++++++++++++++++++++++-
 src/gallium/drivers/radeon/r600_pipe_common.h |  4 ++-
 src/gallium/drivers/radeon/r600_query.c       |  7 +++---
 src/gallium/drivers/radeonsi/si_perfcounter.c |  2 +-
 src/gallium/drivers/radeonsi/si_state_draw.c  |  4 +--
 5 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 960b59c..b28f385 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -96,28 +96,52 @@ void radeon_shader_binary_clean(struct ac_shader_binary *b)
  * \param data_sel	1 = fence, 3 = timestamp
  * \param buf		Buffer
  * \param va		GPU address
  * \param old_value	Previous fence value (for a bug workaround)
  * \param new_value	Fence value to write for this event.
  */
 void r600_gfx_write_event_eop(struct r600_common_context *ctx,
 			      unsigned event, unsigned event_flags,
 			      unsigned data_sel,
 			      struct r600_resource *buf, uint64_t va,
-			      uint32_t old_fence, uint32_t new_fence)
+			      uint32_t old_fence, uint32_t new_fence,
+			      unsigned query_type)
 {
 	struct radeon_winsys_cs *cs = ctx->gfx.cs;
 	unsigned op = EVENT_TYPE(event) |
 		      EVENT_INDEX(5) |
 		      event_flags;
 
 	if (ctx->chip_class >= GFX9) {
+		/* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
+		 * counters) must immediately precede every timestamp event to
+		 * prevent a GPU hang on GFX9.
+		 *
+		 * Occlusion queries don't need to do it here, because they
+		 * always do ZPASS_DONE before the timestamp.
+		 */
+		if (ctx->chip_class == GFX9 &&
+		    query_type != PIPE_QUERY_OCCLUSION_COUNTER &&
+		    query_type != PIPE_QUERY_OCCLUSION_PREDICATE) {
+			struct r600_resource *scratch = ctx->eop_bug_scratch;
+
+			assert(16 * ctx->screen->info.num_render_backends <=
+			       scratch->b.b.width0);
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
+			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
+			radeon_emit(cs, scratch->gpu_address);
+			radeon_emit(cs, scratch->gpu_address >> 32);
+
+			radeon_add_to_buffer_list(ctx, &ctx->gfx, scratch,
+						  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+		}
+
 		radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0));
 		radeon_emit(cs, op);
 		radeon_emit(cs, EOP_DATA_SEL(data_sel));
 		radeon_emit(cs, va);		/* address lo */
 		radeon_emit(cs, va >> 32);	/* address hi */
 		radeon_emit(cs, new_fence);	/* immediate data lo */
 		radeon_emit(cs, 0); /* immediate data hi */
 		radeon_emit(cs, 0); /* unused */
 	} else {
 		if (ctx->chip_class == CIK ||
@@ -648,20 +672,28 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	}
 
 	rctx->b.set_device_reset_callback = r600_set_device_reset_callback;
 
 	r600_init_context_texture_functions(rctx);
 	r600_init_viewport_functions(rctx);
 	r600_streamout_init(rctx);
 	r600_query_init(rctx);
 	cayman_init_msaa(&rctx->b);
 
+	if (rctx->chip_class == GFX9) {
+		rctx->eop_bug_scratch = (struct r600_resource*)
+			pipe_buffer_create(&rscreen->b, 0, PIPE_USAGE_DEFAULT,
+					   16 * rscreen->info.num_render_backends);
+		if (!rctx->eop_bug_scratch)
+			return false;
+	}
+
 	rctx->allocator_zeroed_memory =
 		u_suballocator_create(&rctx->b, rscreen->info.gart_page_size,
 				      0, PIPE_USAGE_DEFAULT, 0, true);
 	if (!rctx->allocator_zeroed_memory)
 		return false;
 
 	rctx->b.stream_uploader = u_upload_create(&rctx->b, 1024 * 1024,
 						  0, PIPE_USAGE_STREAM);
 	if (!rctx->b.stream_uploader)
 		return false;
@@ -717,20 +749,21 @@ void r600_common_context_cleanup(struct r600_common_context *rctx)
 		u_upload_destroy(rctx->b.const_uploader);
 
 	slab_destroy_child(&rctx->pool_transfers);
 	slab_destroy_child(&rctx->pool_transfers_unsync);
 
 	if (rctx->allocator_zeroed_memory) {
 		u_suballocator_destroy(rctx->allocator_zeroed_memory);
 	}
 	rctx->ws->fence_reference(&rctx->last_gfx_fence, NULL);
 	rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL);
+	r600_resource_reference(&rctx->eop_bug_scratch, NULL);
 }
 
 /*
  * pipe_screen
  */
 
 static const struct debug_named_value common_debug_options[] = {
 	/* logging */
 	{ "tex", DBG_TEX, "Print texture info" },
 	{ "nir", DBG_NIR, "Enable experimental NIR shaders" },
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 14bc63e..952fb77 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -553,20 +553,21 @@ struct r600_common_context {
 
 	struct r600_common_screen	*screen;
 	struct radeon_winsys		*ws;
 	struct radeon_winsys_ctx	*ctx;
 	enum radeon_family		family;
 	enum chip_class			chip_class;
 	struct r600_ring		gfx;
 	struct r600_ring		dma;
 	struct pipe_fence_handle	*last_gfx_fence;
 	struct pipe_fence_handle	*last_sdma_fence;
+	struct r600_resource		*eop_bug_scratch;
 	unsigned			num_gfx_cs_flushes;
 	unsigned			initial_gfx_cs_size;
 	unsigned			gpu_reset_counter;
 	unsigned			last_dirty_tex_counter;
 	unsigned			last_compressed_colortex_counter;
 
 	struct threaded_context		*tc;
 	struct u_suballocator		*allocator_zeroed_memory;
 	struct slab_child_pool		pool_transfers;
 	struct slab_child_pool		pool_transfers_unsync; /* for threaded_context */
@@ -740,21 +741,22 @@ r600_invalidate_resource(struct pipe_context *ctx,
 			 struct pipe_resource *resource);
 void r600_replace_buffer_storage(struct pipe_context *ctx,
 				 struct pipe_resource *dst,
 				 struct pipe_resource *src);
 
 /* r600_common_pipe.c */
 void r600_gfx_write_event_eop(struct r600_common_context *ctx,
 			      unsigned event, unsigned event_flags,
 			      unsigned data_sel,
 			      struct r600_resource *buf, uint64_t va,
-			      uint32_t old_fence, uint32_t new_fence);
+			      uint32_t old_fence, uint32_t new_fence,
+			      unsigned query_type);
 unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen);
 void r600_gfx_wait_fence(struct r600_common_context *ctx,
 			 uint64_t va, uint32_t ref, uint32_t mask);
 void r600_draw_rectangle(struct blitter_context *blitter,
 			 int x1, int y1, int x2, int y2, float depth,
 			 enum blitter_attrib_type type,
 			 const union pipe_color_union *attrib);
 bool r600_common_screen_init(struct r600_common_screen *rscreen,
 			     struct radeon_winsys *ws);
 void r600_destroy_common_screen(struct r600_common_screen *rscreen);
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 03ea04d..53b7955 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -773,21 +773,21 @@ static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
 					COPY_DATA_DST_SEL(COPY_DATA_MEM_ASYNC));
 			radeon_emit(cs, 0);
 			radeon_emit(cs, 0);
 			radeon_emit(cs, va);
 			radeon_emit(cs, va >> 32);
 		} else {
 			/* Write the timestamp after the last draw is done.
 			 * (bottom-of-pipe)
 			 */
 			r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
-						 0, 3, NULL, va, 0, 0);
+						 0, 3, NULL, va, 0, 0, query->b.type);
 		}
 		break;
 	case PIPE_QUERY_PIPELINE_STATISTICS:
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
 		radeon_emit(cs, va);
 		radeon_emit(cs, va >> 32);
 		break;
 	default:
 		assert(0);
@@ -858,21 +858,21 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
 		va += 16;
 		for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream)
 			emit_sample_streamout(cs, va + 32 * stream, stream);
 		break;
 	case PIPE_QUERY_TIME_ELAPSED:
 		va += 8;
 		/* fall through */
 	case PIPE_QUERY_TIMESTAMP:
 		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
-					 0, 3, NULL, va, 0, 0);
+					 0, 3, NULL, va, 0, 0, query->b.type);
 		fence_va = va + 8;
 		break;
 	case PIPE_QUERY_PIPELINE_STATISTICS: {
 		unsigned sample_size = (query->result_size - 8) / 2;
 
 		va += sample_size;
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
 		radeon_emit(cs, va);
 		radeon_emit(cs, va >> 32);
@@ -881,21 +881,22 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
 		break;
 	}
 	default:
 		assert(0);
 	}
 	r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
 			RADEON_PRIO_QUERY);
 
 	if (fence_va)
 		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0, 1,
-					 query->buffer.buf, fence_va, 0, 0x80000000);
+					 query->buffer.buf, fence_va, 0, 0x80000000,
+					 query->b.type);
 }
 
 static void r600_query_hw_emit_stop(struct r600_common_context *ctx,
 				    struct r600_query_hw *query)
 {
 	uint64_t va;
 
 	if (!query->buffer.buf)
 		return; // previous buffer allocation failure
 
diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c
index 2e3a174..38aa9ad 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -608,21 +608,21 @@ static void si_pc_emit_start(struct r600_common_context *ctx,
 }
 
 /* Note: The buffer was already added in si_pc_emit_start, so we don't have to
  * do it again in here. */
 static void si_pc_emit_stop(struct r600_common_context *ctx,
 			    struct r600_resource *buffer, uint64_t va)
 {
 	struct radeon_winsys_cs *cs = ctx->gfx.cs;
 
 	r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0, 1,
-				 buffer, va, 1, 0);
+				 buffer, va, 1, 0, 0);
 	r600_gfx_wait_fence(ctx, va, 0, 0xffffffff);
 
 	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
 	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_STOP) | EVENT_INDEX(0));
 	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
 			       S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) |
 			       S_036020_PERFMON_SAMPLE_ENABLE(1));
 }
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 6e65155..0961289 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -891,21 +891,21 @@ void si_emit_cache_flush(struct si_context *sctx)
 					 S_0085F0_CB2_DEST_BASE_ENA(1) |
 					 S_0085F0_CB3_DEST_BASE_ENA(1) |
 					 S_0085F0_CB4_DEST_BASE_ENA(1) |
 					 S_0085F0_CB5_DEST_BASE_ENA(1) |
 					 S_0085F0_CB6_DEST_BASE_ENA(1) |
 					 S_0085F0_CB7_DEST_BASE_ENA(1);
 
 			/* Necessary for DCC */
 			if (rctx->chip_class == VI)
 				r600_gfx_write_event_eop(rctx, V_028A90_FLUSH_AND_INV_CB_DATA_TS,
-							 0, 0, NULL, 0, 0, 0);
+							 0, 0, NULL, 0, 0, 0, 0);
 		}
 		if (rctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB)
 			cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
 					 S_0085F0_DB_DEST_BASE_ENA(1);
 	}
 
 	if (rctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
 		/* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
@@ -993,21 +993,21 @@ void si_emit_cache_flush(struct si_context *sctx)
 			sctx->b.num_L2_invalidates++;
 		}
 
 		/* Do the flush (enqueue the event and wait for it). */
 		va = sctx->wait_mem_scratch->gpu_address;
 		sctx->wait_mem_number++;
 
 		r600_gfx_write_event_eop(rctx, cb_db_event, tc_flags, 1,
 					 sctx->wait_mem_scratch, va,
 					 sctx->wait_mem_number - 1,
-					 sctx->wait_mem_number);
+					 sctx->wait_mem_number, 0);
 		r600_gfx_wait_fence(rctx, va, sctx->wait_mem_number, 0xffffffff);
 	}
 
 	/* Make sure ME is idle (it executes most packets) before continuing.
 	 * This prevents read-after-write hazards between PFP and ME.
 	 */
 	if (cp_coher_cntl ||
 	    (rctx->flags & (SI_CONTEXT_CS_PARTIAL_FLUSH |
 			    SI_CONTEXT_INV_VMEM_L1 |
 			    SI_CONTEXT_INV_GLOBAL_L2 |
-- 
2.7.4



More information about the mesa-dev mailing list