[Mesa-dev] [PATCH 3/7] radeonsi: add SI_QUERY_TIME_ELAPSED_SDMA for measuring SDMA performance

Tue Aug 21 05:50:32 UTC 2018

From: Marek Olšák <marek.olsak at amd.com>

---
 src/amd/common/sid.h                     |  4 ++++
 src/gallium/drivers/radeonsi/si_dma_cs.c | 29 ++++++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_pipe.h   |  2 ++
 src/gallium/drivers/radeonsi/si_query.c  | 21 +++++++++++++++--
 src/gallium/drivers/radeonsi/si_query.h  |  1 +
 5 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/src/amd/common/sid.h b/src/amd/common/sid.h
index d9c4a1a7414..d696c01d4dd 100644
--- a/src/amd/common/sid.h
+++ b/src/amd/common/sid.h
@@ -9133,20 +9133,24 @@
 #define        CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW  0x5
 #define        CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW    0x6
 #define    CIK_SDMA_OPCODE_WRITE                   0x2
 #define        SDMA_WRITE_SUB_OPCODE_LINEAR               0x0
 #define        SDMA_WRTIE_SUB_OPCODE_TILED                0x1
 #define    CIK_SDMA_OPCODE_INDIRECT_BUFFER         0x4
 #define    CIK_SDMA_PACKET_FENCE                   0x5
 #define    CIK_SDMA_PACKET_TRAP                    0x6
 #define    CIK_SDMA_PACKET_SEMAPHORE               0x7
 #define    CIK_SDMA_PACKET_CONSTANT_FILL           0xb
+#define    CIK_SDMA_OPCODE_TIMESTAMP               0xd
+#define        SDMA_TS_SUB_OPCODE_SET_LOCAL_TIMESTAMP     0x0
+#define        SDMA_TS_SUB_OPCODE_GET_LOCAL_TIMESTAMP     0x1
+#define        SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP    0x2
 #define    CIK_SDMA_PACKET_SRBM_WRITE              0xe
 #define    CIK_SDMA_COPY_MAX_SIZE                  0x3fffe0
 
 enum amd_cmp_class_flags {
 	S_NAN = 1 << 0,        // Signaling NaN
 	Q_NAN = 1 << 1,        // Quiet NaN
 	N_INFINITY = 1 << 2,   // Negative infinity
 	N_NORMAL = 1 << 3,     // Negative normal
 	N_SUBNORMAL = 1 << 4,  // Negative subnormal
 	N_ZERO = 1 << 5,       // Negative zero
diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c
index 3bb769309e3..7db9570af3c 100644
--- a/src/gallium/drivers/radeonsi/si_dma_cs.c
+++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -16,32 +16,61 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
 #include "si_pipe.h"
+#include "sid.h"
 
 static void si_dma_emit_wait_idle(struct si_context *sctx)
 {
 	struct radeon_cmdbuf *cs = sctx->dma_cs;
 
 	/* NOP waits for idle. */
 	if (sctx->chip_class >= CIK)
 		radeon_emit(cs, 0x00000000); /* NOP */
 	else
 		radeon_emit(cs, 0xf0000000); /* NOP */
 }
 
+/* Emit SDMA GET_GLOBAL_TIMESTAMP to dst + offset. Must not be called on SI. */
+void si_dma_emit_timestamp(struct si_context *sctx, struct r600_resource *dst,
+			   uint64_t offset)
+{
+	struct radeon_cmdbuf *cs = sctx->dma_cs;
+	uint64_t va = dst->gpu_address + offset;
+
+	if (sctx->chip_class == SI) {
+		unreachable("SI DMA doesn't support the timestamp packet.");
+		return;
+	}
+
+	/* Mark the 8-byte destination range as valid (initialized) so that
+	 * transfer_map knows it should wait for the GPU when mapping it. */
+	util_range_add(&dst->valid_buffer_range, offset, offset + 8);
+
+	assert(va % 8 == 0);
+
+	si_need_dma_space(sctx, 4, dst, NULL); /* 1 dw wait-idle NOP + 3 dw packet */
+	si_dma_emit_wait_idle(sctx);
+
+	radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP,
+					SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP,
+					0));
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+}
+
 void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
 		       struct r600_resource *dst, struct r600_resource *src)
 {
 	uint64_t vram = ctx->dma_cs->used_vram;
 	uint64_t gtt = ctx->dma_cs->used_gart;
 
 	if (dst) {
 		vram += dst->vram_usage;
 		gtt += dst->gart_usage;
 	}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 95489f09612..4c3f13b84e2 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1148,20 +1148,22 @@ void si_log_draw_state(struct si_context *sctx, struct u_log_context *log);
 void si_log_compute_state(struct si_context *sctx, struct u_log_context *log);
 void si_init_debug_functions(struct si_context *sctx);
 void si_check_vm_faults(struct si_context *sctx,
 			struct radeon_saved_cs *saved, enum ring_type ring);
 bool si_replace_shader(unsigned num, struct ac_shader_binary *binary);
 
 /* si_dma.c */
 void si_init_dma_functions(struct si_context *sctx);
 
 /* si_dma_cs.c */
+void si_dma_emit_timestamp(struct si_context *sctx, struct r600_resource *dst,
+			   uint64_t offset);
 void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
 		       struct r600_resource *dst, struct r600_resource *src);
 void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
 		     struct pipe_fence_handle **fence);
 void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
 			    uint64_t offset, uint64_t size, unsigned value);
 
 /* si_fence.c */
 void si_gfx_write_event_eop(struct si_context *ctx,
 			    unsigned event, unsigned event_flags,
diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c
index f768b531139..93efbd4ef4a 100644
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -641,20 +641,25 @@ static struct pipe_query *si_query_hw_create(struct si_screen *sscreen,
 	query->ops = &query_hw_default_hw_ops;
 
 	switch (query_type) {
 	case PIPE_QUERY_OCCLUSION_COUNTER:
 	case PIPE_QUERY_OCCLUSION_PREDICATE:
 	case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
 		query->result_size = 16 * sscreen->info.num_render_backends;
 		query->result_size += 16; /* for the fence + alignment */
 		query->num_cs_dw_end = 6 + si_gfx_write_fence_dwords(sscreen);
 		break;
+	case SI_QUERY_TIME_ELAPSED_SDMA:
+		/* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. */
+		query->result_size = 64;
+		query->num_cs_dw_end = 0;
+		break;
 	case PIPE_QUERY_TIME_ELAPSED:
 		query->result_size = 24;
 		query->num_cs_dw_end = 8 + si_gfx_write_fence_dwords(sscreen);
 		break;
 	case PIPE_QUERY_TIMESTAMP:
 		query->result_size = 16;
 		query->num_cs_dw_end = 8 + si_gfx_write_fence_dwords(sscreen);
 		query->flags = SI_QUERY_HW_FLAG_NO_START;
 		break;
 	case PIPE_QUERY_PRIMITIVES_EMITTED:
@@ -740,20 +745,23 @@ static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va,
 }
 
 static void si_query_hw_do_emit_start(struct si_context *sctx,
 					struct si_query_hw *query,
 					struct r600_resource *buffer,
 					uint64_t va)
 {
 	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
 	switch (query->b.type) {
+	case SI_QUERY_TIME_ELAPSED_SDMA:
+		si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address);
+		return;
 	case PIPE_QUERY_OCCLUSION_COUNTER:
 	case PIPE_QUERY_OCCLUSION_PREDICATE:
 	case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
 		radeon_emit(cs, va);
 		radeon_emit(cs, va >> 32);
 		break;
 	case PIPE_QUERY_PRIMITIVES_EMITTED:
 	case PIPE_QUERY_PRIMITIVES_GENERATED:
@@ -795,21 +803,22 @@ static void si_query_hw_emit_start(struct si_context *sctx,
 				   struct si_query_hw *query)
 {
 	uint64_t va;
 
 	if (!query->buffer.buf)
 		return; // previous buffer allocation failure
 
 	si_update_occlusion_query_state(sctx, query->b.type, 1);
 	si_update_prims_generated_query_state(sctx, query->b.type, 1);
 
-	si_need_gfx_cs_space(sctx);
+	if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA)
+		si_need_gfx_cs_space(sctx);
 
 	/* Get a new query buffer if needed. */
 	if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
 		struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
 		*qbuf = query->buffer;
 		query->buffer.results_end = 0;
 		query->buffer.previous = qbuf;
 		query->buffer.buf = si_new_query_buffer(sctx->screen, query);
 		if (!query->buffer.buf)
 			return;
@@ -825,20 +834,23 @@ static void si_query_hw_emit_start(struct si_context *sctx,
 
 static void si_query_hw_do_emit_stop(struct si_context *sctx,
 				       struct si_query_hw *query,
 				       struct r600_resource *buffer,
 				       uint64_t va)
 {
 	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	uint64_t fence_va = 0;
 
 	switch (query->b.type) {
+	case SI_QUERY_TIME_ELAPSED_SDMA:
+		si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address);
+		return;
 	case PIPE_QUERY_OCCLUSION_COUNTER:
 	case PIPE_QUERY_OCCLUSION_PREDICATE:
 	case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
 		va += 8;
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
 		radeon_emit(cs, va);
 		radeon_emit(cs, va >> 32);
 
 		fence_va = va + sctx->screen->info.num_render_backends * 16 - 8;
@@ -1015,21 +1027,22 @@ static void si_emit_query_predication(struct si_context *ctx)
 	}
 }
 
 static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
 {
 	struct si_screen *sscreen =
 		(struct si_screen *)ctx->screen;
 
 	if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
 	    query_type == PIPE_QUERY_GPU_FINISHED ||
-	    query_type >= PIPE_QUERY_DRIVER_SPECIFIC)
+	    (query_type >= PIPE_QUERY_DRIVER_SPECIFIC &&
+	     query_type != SI_QUERY_TIME_ELAPSED_SDMA))
 		return si_query_sw_create(query_type);
 
 	return si_query_hw_create(sscreen, query_type, index);
 }
 
 static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_query *rquery = (struct si_query *)query;
 
@@ -1231,20 +1244,23 @@ static void si_query_hw_add_result(struct si_screen *sscreen,
 		for (unsigned i = 0; i < max_rbs; ++i) {
 			unsigned results_base = i * 16;
 			result->b = result->b ||
 				si_query_read_result(buffer + results_base, 0, 2, true) != 0;
 		}
 		break;
 	}
 	case PIPE_QUERY_TIME_ELAPSED:
 		result->u64 += si_query_read_result(buffer, 0, 2, false);
 		break;
+	case SI_QUERY_TIME_ELAPSED_SDMA:
+		result->u64 += si_query_read_result(buffer, 0, 32/4, false);
+		break;
 	case PIPE_QUERY_TIMESTAMP:
 		result->u64 = *(uint64_t*)buffer;
 		break;
 	case PIPE_QUERY_PRIMITIVES_EMITTED:
 		/* SAMPLE_STREAMOUTSTATS stores this structure:
 		 * {
 		 *    u64 NumPrimitivesWritten;
 		 *    u64 PrimitiveStorageNeeded;
 		 * }
 		 * We only need NumPrimitivesWritten here. */
@@ -1375,20 +1391,21 @@ bool si_query_hw_get_result(struct si_context *sctx,
 
 		while (results_base != qbuf->results_end) {
 			query->ops->add_result(sscreen, query, map + results_base,
 					       result);
 			results_base += query->result_size;
 		}
 	}
 
 	/* Convert the time to expected units. */
 	if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
+	    rquery->type == SI_QUERY_TIME_ELAPSED_SDMA ||
 	    rquery->type == PIPE_QUERY_TIMESTAMP) {
 		result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
 	}
 	return true;
 }
 
 static void si_restore_qbo_state(struct si_context *sctx,
 				 struct si_qbo_state *st)
 {
 	sctx->b.bind_compute_state(&sctx->b, st->saved_compute);
diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h
index 3f60208e2f8..bc3eb397bc5 100644
--- a/src/gallium/drivers/radeonsi/si_query.h
+++ b/src/gallium/drivers/radeonsi/si_query.h
@@ -102,20 +102,21 @@ enum {
 	SI_QUERY_GPU_SCRATCH_RAM_BUSY,
 	SI_QUERY_NUM_COMPILATIONS,
 	SI_QUERY_NUM_SHADERS_CREATED,
 	SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO,
 	SI_QUERY_NUM_SHADER_CACHE_HITS,
 	SI_QUERY_GPIN_ASIC_ID,
 	SI_QUERY_GPIN_NUM_SIMD,
 	SI_QUERY_GPIN_NUM_RB,
 	SI_QUERY_GPIN_NUM_SPI,
 	SI_QUERY_GPIN_NUM_SE,
+	SI_QUERY_TIME_ELAPSED_SDMA,
 
 	SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100,
 };
 
 enum {
 	SI_QUERY_GROUP_GPIN = 0,
 	SI_NUM_SW_QUERY_GROUPS
 };
 
 struct si_query_ops {
-- 
2.17.1