Mesa (main): radv: add initial SPM support on GFX10+

Wed Dec 8 09:05:31 UTC 2021

Module: Mesa
Branch: main
Commit: e18e857292ccdfeaa7200b397576fea18492975e
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=e18e857292ccdfeaa7200b397576fea18492975e

Author: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Date:   Fri Jun  4 16:58:33 2021 +0200

radv: add initial SPM support on GFX10+

RGP doesn't support previous generations. This can be enabled with
RADV_THREAD_TRACE_CACHE_COUNTERS=true.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13737>

---

 docs/envvars.rst                        |   3 +
 src/amd/vulkan/layers/radv_sqtt_layer.c |   7 +-
 src/amd/vulkan/meson.build              |   1 +
 src/amd/vulkan/radv_device.c            |  26 +++-
 src/amd/vulkan/radv_private.h           |  12 ++
 src/amd/vulkan/radv_spm.c               | 230 ++++++++++++++++++++++++++++++++
 src/amd/vulkan/radv_sqtt.c              |  17 +++
 7 files changed, 293 insertions(+), 3 deletions(-)

diff --git a/docs/envvars.rst b/docs/envvars.rst
index 1068f6dd79d..f397c41dcd7 100644
--- a/docs/envvars.rst
+++ b/docs/envvars.rst
@@ -723,6 +723,9 @@ RADV driver environment variables
    set the SQTT/RGP buffer size in bytes (default value is 32MiB, the buffer is
    automatically resized if too small)
 
+:envvar:`RADV_THREAD_TRACE_CACHE_COUNTERS`
+   enable/disable SQTT/RGP cache counters on GFX10+ (disabled by default)
+
 :envvar:`RADV_THREAD_TRACE_INSTRUCTION_TIMING`
    enable/disable SQTT/RGP instruction timing (enabled by default)
 
diff --git a/src/amd/vulkan/layers/radv_sqtt_layer.c b/src/amd/vulkan/layers/radv_sqtt_layer.c
index 2213d9adc96..6cfa42ee192 100644
--- a/src/amd/vulkan/layers/radv_sqtt_layer.c
+++ b/src/amd/vulkan/layers/radv_sqtt_layer.c
@@ -363,7 +363,12 @@ radv_handle_thread_trace(VkQueue _queue)
       radv_QueueWaitIdle(_queue);
 
       if (radv_get_thread_trace(queue, &thread_trace)) {
-         ac_dump_rgp_capture(&queue->device->physical_device->rad_info, &thread_trace, NULL);
+         struct ac_spm_trace_data *spm_trace = NULL;
+
+         if (queue->device->spm_trace.bo)
+            spm_trace = &queue->device->spm_trace;
+
+         ac_dump_rgp_capture(&queue->device->physical_device->rad_info, &thread_trace, spm_trace);
       } else {
          /* Trigger a new capture if the driver failed to get
           * the trace because the buffer was too small.
diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build
index 04659a0098c..2e365b7a192 100644
--- a/src/amd/vulkan/meson.build
+++ b/src/amd/vulkan/meson.build
@@ -81,6 +81,7 @@ libradv_files = files(
   'radv_shader_args.c',
   'radv_shader_args.h',
   'radv_shader_info.c',
+  'radv_spm.c',
   'radv_sqtt.c',
   'radv_query.c',
   'radv_util.c',
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index aac5f842b76..c2dc58532c7 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -347,6 +347,13 @@ radv_thread_trace_enabled()
           getenv("RADV_THREAD_TRACE_TRIGGER");
 }
 
+static bool
+radv_spm_trace_enabled()
+{
+   return radv_thread_trace_enabled() && /* SPM is only enabled together with thread trace. */
+          debug_get_bool_option("RADV_THREAD_TRACE_CACHE_COUNTERS", false); /* Opt-in. */
+}
+
 #if defined(VK_USE_PLATFORM_WAYLAND_KHR) || defined(VK_USE_PLATFORM_XCB_KHR) ||                    \
    defined(VK_USE_PLATFORM_XLIB_KHR) || defined(VK_USE_PLATFORM_DISPLAY_KHR)
 #define RADV_USE_WSI_PLATFORM
@@ -3149,9 +3156,20 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
          goto fail;
 
       fprintf(stderr, "radv: Thread trace support is enabled (initial buffer size: %u MiB, "
-                      "instruction timing: %s).\n",
+                      "instruction timing: %s, cache counters: %s).\n",
               device->thread_trace.buffer_size / (1024 * 1024),
-              radv_is_instruction_timing_enabled() ? "enabled" : "disabled");
+              radv_is_instruction_timing_enabled() ? "enabled" : "disabled",
+              radv_spm_trace_enabled() ? "enabled" : "disabled");
+
+      if (radv_spm_trace_enabled()) {
+         if (device->physical_device->rad_info.chip_class < GFX10) {
+            fprintf(stderr, "SPM isn't supported for this GPU!\n");
+            abort();
+         }
+
+         if (!radv_spm_init(device))
+            goto fail;
+      }
    }
 
    if (getenv("RADV_TRAP_HANDLER")) {
@@ -3273,6 +3291,8 @@ fail_meta:
 fail:
    radv_thread_trace_finish(device);
 
+   radv_spm_finish(device);
+
    radv_trap_handler_finish(device);
    radv_finish_trace(device);
 
@@ -3342,6 +3362,8 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
 
    radv_thread_trace_finish(device);
 
+   radv_spm_finish(device);
+
    vk_device_finish(&device->vk);
    vk_free(&device->vk.alloc, device);
 }
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index cce48e21424..31ac95628d3 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -70,6 +70,7 @@
 #include "ac_binary.h"
 #include "ac_gpu_info.h"
 #include "ac_shader_util.h"
+#include "ac_spm.h"
 #include "ac_sqtt.h"
 #include "ac_surface.h"
 #include "radv_constants.h"
@@ -834,6 +835,12 @@ struct radv_device {
    /* Thread trace. */
    struct ac_thread_trace_data thread_trace;
 
+   /* SPM. */
+   struct ac_spm_trace_data spm_trace;
+
+   /* Performance counters. */
+   struct ac_perfcounters perfcounters;
+
    /* Trap handler. */
    struct radv_shader *trap_handler_shader;
    struct radeon_winsys_bo *tma_bo; /* Trap Memory Address */
@@ -2928,6 +2935,11 @@ void radv_perfcounter_emit_reset(struct radeon_cmdbuf *cs);
 void radv_perfcounter_emit_start(struct radv_device *device, struct radeon_cmdbuf *cs, int family);
 void radv_perfcounter_emit_stop(struct radv_device *device, struct radeon_cmdbuf *cs, int family);
 
+/* radv_spm.c */
+bool radv_spm_init(struct radv_device *device);
+void radv_spm_finish(struct radv_device *device);
+void radv_emit_spm_setup(struct radv_device *device, struct radeon_cmdbuf *cs);
+
 #define RADV_FROM_HANDLE(__radv_type, __name, __handle) \
    VK_FROM_HANDLE(__radv_type, __name, __handle)
 
diff --git a/src/amd/vulkan/radv_spm.c b/src/amd/vulkan/radv_spm.c
new file mode 100644
index 00000000000..f8669fee040
--- /dev/null
+++ b/src/amd/vulkan/radv_spm.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright © 2021 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <inttypes.h>
+
+#include "radv_cs.h"
+#include "radv_private.h"
+#include "sid.h"
+
+#define SPM_RING_BASE_ALIGN 32
+
+static bool
+radv_spm_init_bo(struct radv_device *device)
+{
+   struct radeon_winsys *ws = device->ws;
+   uint64_t size = 32 * 1024 * 1024; /* Default to 32 MiB. */
+   uint16_t sample_interval = 4096; /* Default to 4096 clk. */
+   VkResult result;
+
+   device->spm_trace.buffer_size = size;
+   device->spm_trace.sample_interval = sample_interval;
+
+   struct radeon_winsys_bo *bo = NULL;
+   result = ws->buffer_create(
+      ws, size, 4096, RADEON_DOMAIN_VRAM,
+      RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_ZERO_VRAM,
+      RADV_BO_PRIORITY_SCRATCH, 0, &bo);
+   device->spm_trace.bo = bo; /* Released by radv_spm_finish(), also after a failure below. */
+   if (result != VK_SUCCESS)
+      return false;
+
+   result = ws->buffer_make_resident(ws, device->spm_trace.bo, true);
+   if (result != VK_SUCCESS)
+      return false;
+
+   device->spm_trace.ptr = ws->buffer_map(device->spm_trace.bo);
+   if (!device->spm_trace.ptr)
+      return false;
+
+   return true;
+}
+
+static void
+radv_emit_spm_counters(struct radv_device *device, struct radeon_cmdbuf *cs)
+{
+   struct ac_spm_trace_data *spm_trace = &device->spm_trace;
+
+   for (uint32_t b = 0; b < spm_trace->num_used_sq_block_sel; b++) {
+      struct ac_spm_block_select *sq_block_sel = &spm_trace->sq_block_sel[b];
+      const struct ac_spm_counter_select *cntr_sel = &sq_block_sel->counters[0];
+      uint32_t reg_base = R_036700_SQ_PERFCOUNTER0_SELECT; /* Selects are 4 bytes apart. */
+
+      radeon_set_uconfig_reg_seq(cs, reg_base + b * 4, 1);
+      radeon_emit(cs, cntr_sel->sel0 | S_036700_SQC_BANK_MASK(0xf)); /* SQC_BANK_MASK only gfx10 */
+   }
+
+   for (uint32_t b = 0; b < spm_trace->num_block_sel; b++) {
+      struct ac_spm_block_select *block_sel = &spm_trace->block_sel[b];
+      struct ac_pc_block_base *regs = block_sel->b->b->b;
+
+      radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, block_sel->grbm_gfx_index);
+
+      for (unsigned c = 0; c < block_sel->num_counters; c++) {
+         const struct ac_spm_counter_select *cntr_sel = &block_sel->counters[c];
+
+         if (!cntr_sel->active) /* Skip counter slots that are not in use. */
+            continue;
+
+         radeon_set_uconfig_reg_seq(cs, regs->select0[c], 1);
+         radeon_emit(cs, cntr_sel->sel0);
+
+         radeon_set_uconfig_reg_seq(cs, regs->select1[c], 1);
+         radeon_emit(cs, cntr_sel->sel1);
+      }
+   }
+
+   /* Restore global broadcasting to all SEs, SHs and instances. */
+   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
+                              S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
+                              S_030800_INSTANCE_BROADCAST_WRITES(1));
+}
+
+void
+radv_emit_spm_setup(struct radv_device *device, struct radeon_cmdbuf *cs)
+{
+   struct ac_spm_trace_data *spm_trace = &device->spm_trace;
+   uint64_t va = radv_buffer_get_va(spm_trace->bo);
+   uint64_t ring_size = spm_trace->buffer_size;
+
+   /* The ring VA and size must be SPM_RING_BASE_ALIGN-aligned and the interval at least 32. */
+   assert(!(va & (SPM_RING_BASE_ALIGN - 1)));
+   assert(!(ring_size & (SPM_RING_BASE_ALIGN - 1)));
+   assert(spm_trace->sample_interval >= 32);
+
+   /* Configure the SPM ring buffer. */
+   radeon_set_uconfig_reg(cs, R_037200_RLC_SPM_PERFMON_CNTL,
+                              S_037200_PERFMON_RING_MODE(0) | /* no stall and no interrupt on overflow */
+                              S_037200_PERFMON_SAMPLE_INTERVAL(spm_trace->sample_interval)); /* in sclk */
+   radeon_set_uconfig_reg(cs, R_037204_RLC_SPM_PERFMON_RING_BASE_LO, va);
+   radeon_set_uconfig_reg(cs, R_037208_RLC_SPM_PERFMON_RING_BASE_HI,
+                              S_037208_RING_BASE_HI(va >> 32));
+   radeon_set_uconfig_reg(cs, R_03720C_RLC_SPM_PERFMON_RING_SIZE, ring_size);
+
+   /* Configure the muxsel: start by summing the line counts of every segment. */
+   uint32_t total_muxsel_lines = 0;
+   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
+      total_muxsel_lines += spm_trace->num_muxsel_lines[s];
+   }
+
+   radeon_set_uconfig_reg(cs, R_03726C_RLC_SPM_ACCUM_MODE, 0);
+   radeon_set_uconfig_reg(cs, R_037210_RLC_SPM_PERFMON_SEGMENT_SIZE, 0);
+   radeon_set_uconfig_reg(cs, R_03727C_RLC_SPM_PERFMON_SE3TO0_SEGMENT_SIZE,
+                              S_03727C_SE0_NUM_LINE(spm_trace->num_muxsel_lines[0]) |
+                              S_03727C_SE1_NUM_LINE(spm_trace->num_muxsel_lines[1]) |
+                              S_03727C_SE2_NUM_LINE(spm_trace->num_muxsel_lines[2]) |
+                              S_03727C_SE3_NUM_LINE(spm_trace->num_muxsel_lines[3]));
+   radeon_set_uconfig_reg(cs, R_037280_RLC_SPM_PERFMON_GLB_SEGMENT_SIZE,
+                              S_037280_PERFMON_SEGMENT_SIZE(total_muxsel_lines) |
+                              S_037280_GLOBAL_NUM_LINE(spm_trace->num_muxsel_lines[4]));
+
+   /* Upload each non-empty segment's muxsel RAM to the RLC. */
+   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
+      unsigned rlc_muxsel_addr, rlc_muxsel_data;
+      unsigned grbm_gfx_index = S_030800_SH_BROADCAST_WRITES(1) |
+                                S_030800_INSTANCE_BROADCAST_WRITES(1);
+
+      if (!spm_trace->num_muxsel_lines[s])
+         continue;
+
+      if (s == AC_SPM_SEGMENT_TYPE_GLOBAL) {
+         grbm_gfx_index |= S_030800_SE_BROADCAST_WRITES(1);
+
+         rlc_muxsel_addr = R_037224_RLC_SPM_GLOBAL_MUXSEL_ADDR;
+         rlc_muxsel_data = R_037228_RLC_SPM_GLOBAL_MUXSEL_DATA;
+      } else {
+         grbm_gfx_index |= S_030800_SE_INDEX(s);
+
+         rlc_muxsel_addr = R_03721C_RLC_SPM_SE_MUXSEL_ADDR;
+         rlc_muxsel_data = R_037220_RLC_SPM_SE_MUXSEL_DATA;
+      }
+
+      radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, grbm_gfx_index);
+
+      for (unsigned l = 0; l < spm_trace->num_muxsel_lines[s]; l++) {
+         uint32_t *data = (uint32_t *)spm_trace->muxsel_lines[s][l].muxsel;
+
+         /* Point MUXSEL_ADDR at the first entry of line l. */
+         radeon_set_uconfig_reg(cs, rlc_muxsel_addr, l * AC_SPM_MUXSEL_LINE_SIZE);
+
+         /* Write the whole muxsel line through MUXSEL_DATA with a single packet. */
+         radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + AC_SPM_MUXSEL_LINE_SIZE, 0));
+         radeon_emit(cs, S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) |
+                         S_370_WR_CONFIRM(1) |
+                         S_370_ENGINE_SEL(V_370_ME) |
+                         S_370_WR_ONE_ADDR(1));
+         radeon_emit(cs, rlc_muxsel_data >> 2);
+         radeon_emit(cs, 0);
+         radeon_emit_array(cs, data, AC_SPM_MUXSEL_LINE_SIZE);
+      }
+   }
+
+   /* Select SPM counters. */
+   radv_emit_spm_counters(device, cs);
+}
+
+bool
+radv_spm_init(struct radv_device *device)
+{
+   const struct radeon_info *info = &device->physical_device->rad_info;
+   struct ac_perfcounters *pc = &device->perfcounters;
+   struct ac_spm_counter_create_info spm_counters[] = {
+      {TCP, 0, 0x9},    /* Number of L2 requests. */
+      {TCP, 0, 0x12},   /* Number of L2 misses. */
+      {SQ, 0, 0x14f},   /* Number of SCACHE hits. */
+      {SQ, 0, 0x150},   /* Number of SCACHE misses. */
+      {SQ, 0, 0x151},   /* Number of SCACHE misses duplicate. */
+      {SQ, 0, 0x12c},   /* Number of ICACHE hits. */
+      {SQ, 0, 0x12d},   /* Number of ICACHE misses. */
+      {SQ, 0, 0x12e},   /* Number of ICACHE misses duplicate. */
+      {GL1C, 0, 0xe},   /* Number of GL1C requests. */
+      {GL1C, 0, 0x12},  /* Number of GL1C misses. */
+      {GL2C, 0, 0x3},   /* Number of GL2C requests. */
+      {GL2C, 0, info->chip_class >= GFX10_3 ? 0x2b : 0x23},  /* Number of GL2C misses (id differs on GFX10.3+). */
+   };
+
+   if (!ac_init_perfcounters(info, false, false, pc))
+      return false;
+
+   if (!ac_init_spm(info, pc, ARRAY_SIZE(spm_counters), spm_counters, &device->spm_trace))
+      return false;
+
+   if (!radv_spm_init_bo(device))
+      return false; /* The caller's failure path calls radv_spm_finish(). */
+
+   return true;
+}
+
+void
+radv_spm_finish(struct radv_device *device)
+{
+   struct radeon_winsys *ws = device->ws;
+
+   if (device->spm_trace.bo) { /* Set by radv_spm_init_bo(). */
+      ws->buffer_make_resident(ws, device->spm_trace.bo, false); /* Undo the residency request. */
+      ws->buffer_destroy(ws, device->spm_trace.bo);
+   }
+
+   ac_destroy_spm(&device->spm_trace);
+   ac_destroy_perfcounters(&device->perfcounters);
+}
diff --git a/src/amd/vulkan/radv_sqtt.c b/src/amd/vulkan/radv_sqtt.c
index f7a996f500c..4180fba5950 100644
--- a/src/amd/vulkan/radv_sqtt.c
+++ b/src/amd/vulkan/radv_sqtt.c
@@ -543,9 +543,21 @@ radv_begin_thread_trace(struct radv_queue *queue)
    /* Enable SQG events that collects thread trace data. */
    radv_emit_spi_config_cntl(device, cs, true);
 
+   radv_perfcounter_emit_reset(cs);
+
+   if (device->spm_trace.bo) {
+      /* Enable all shader stages by default. */
+      radv_perfcounter_emit_shaders(cs, 0x7f);
+
+      radv_emit_spm_setup(device, cs);
+   }
+
    /* Start SQTT. */
    radv_emit_thread_trace_start(device, cs, family);
 
+   if (device->spm_trace.bo)
+      radv_perfcounter_emit_start(device, cs, family);
+
    result = ws->cs_finalize(cs);
    if (result != VK_SUCCESS) {
       ws->cs_destroy(cs);
@@ -591,9 +603,14 @@ radv_end_thread_trace(struct radv_queue *queue)
    /* Make sure to wait-for-idle before stopping SQTT. */
    radv_emit_wait_for_idle(device, cs, family);
 
+   if (device->spm_trace.bo)
+      radv_perfcounter_emit_stop(device, cs, family);
+
    /* Stop SQTT. */
    radv_emit_thread_trace_stop(device, cs, family);
 
+   radv_perfcounter_emit_reset(cs);
+
    /* Restore previous state by disabling SQG events. */
    radv_emit_spi_config_cntl(device, cs, false);