Mesa (main): v3dv: Implement VK_KHR_performance_query

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Mon Jun 27 08:08:59 UTC 2022


Module: Mesa
Branch: main
Commit: f392b6c1ad4a360a02eb2a4024e3d7bb03a4b759
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=f392b6c1ad4a360a02eb2a4024e3d7bb03a4b759

Author: Ella Stanforth <estanforth at igalia.com>
Date:   Tue Nov 23 22:29:48 2021 +0000

v3dv: Implement VK_KHR_performance_query

Reviewed-by: Iago Toral Quiroga <itoral at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14061>

---

 docs/features.txt                     |   2 +-
 src/broadcom/vulkan/v3dv_cmd_buffer.c |  90 ++++++-
 src/broadcom/vulkan/v3dv_device.c     |  22 ++
 src/broadcom/vulkan/v3dv_private.h    |  62 ++++-
 src/broadcom/vulkan/v3dv_query.c      | 468 ++++++++++++++++++++++++++++++----
 src/broadcom/vulkan/v3dv_queue.c      | 154 +++++++++--
 6 files changed, 713 insertions(+), 85 deletions(-)

diff --git a/docs/features.txt b/docs/features.txt
index 047f6ee4d86..cd25d33f3ee 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -510,7 +510,7 @@ Khronos extensions that are not part of any Vulkan version:
   VK_KHR_get_display_properties2                        DONE (anv, lvp, radv, tu, v3dv)
   VK_KHR_get_surface_capabilities2                      DONE (anv, lvp, radv, tu, v3dv, vn)
   VK_KHR_incremental_present                            DONE (anv, lvp, radv, tu, v3dv, vn)
-  VK_KHR_performance_query                              DONE (anv/gen8+, tu)
+  VK_KHR_performance_query                              DONE (anv/gen8+, tu, v3dv)
   VK_KHR_pipeline_executable_properties                 DONE (anv, radv, tu, v3dv)
   VK_KHR_pipeline_library                               DONE (lvp, radv)
   VK_KHR_push_descriptor                                DONE (anv, lvp, radv, tu)
diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index 8fd5758ff29..f4e6a9956c7 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -777,6 +777,8 @@ v3dv_job_init(struct v3dv_job *job,
       job->is_transfer = cmd_buffer->state.is_transfer;
 
       cmd_buffer_serialize_job_if_needed(cmd_buffer, job);
+
+      job->perf = cmd_buffer->state.query.active_query.perf;
    }
 }
 
@@ -3223,24 +3225,44 @@ v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,
                             uint32_t query,
                             VkQueryControlFlags flags)
 {
-   /* FIXME: we only support one active query for now */
-   assert(cmd_buffer->state.query.active_query.bo == NULL);
    assert(query < pool->query_count);
+   switch (pool->query_type) {
+   case VK_QUERY_TYPE_OCCLUSION:
+      /* FIXME: we only support one active occlusion query for now */
+      assert(cmd_buffer->state.query.active_query.bo == NULL);
+
+      cmd_buffer->state.query.active_query.bo = pool->queries[query].bo;
+      cmd_buffer->state.query.active_query.offset = pool->queries[query].offset;
+      cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+      break;
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+      assert(cmd_buffer->state.query.active_query.perf == NULL);
+      if (cmd_buffer->state.pass)
+         v3dv_cmd_buffer_subpass_finish(cmd_buffer);
 
-   cmd_buffer->state.query.active_query.bo = pool->queries[query].bo;
-   cmd_buffer->state.query.active_query.offset = pool->queries[query].offset;
-   cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+      cmd_buffer->state.query.active_query.perf =
+         &pool->queries[query].perf;
+
+      if (cmd_buffer->state.pass) {
+         v3dv_cmd_buffer_subpass_resume(cmd_buffer,
+            cmd_buffer->state.subpass_idx);
+      }
+      break;
+   }
+   default:
+      unreachable("Unsupported query type");
+   }
 }
 
-void
-v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
-                          struct v3dv_query_pool *pool,
-                          uint32_t query)
+static void
+v3dv_cmd_buffer_schedule_end_query(struct v3dv_cmd_buffer *cmd_buffer,
+                                   struct v3dv_query_pool *pool,
+                                   uint32_t query)
 {
    assert(query < pool->query_count);
-   assert(cmd_buffer->state.query.active_query.bo != NULL);
 
-   if  (cmd_buffer->state.pass) {
+   if  (cmd_buffer->state.pass &&
+        pool->query_type != VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
       /* Queue the EndQuery in the command buffer state, we will create a CPU
        * job to flag all of these queries as possibly available right after the
        * render pass job in which they have been recorded.
@@ -3295,11 +3317,57 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
 
       list_addtail(&job->list_link, &cmd_buffer->jobs);
    }
+}
+
+static void
+v3dv_cmd_buffer_end_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer,
+                                    struct v3dv_query_pool *pool,
+                                    uint32_t query)
+{
+   assert(query < pool->query_count);
+   assert(cmd_buffer->state.query.active_query.bo != NULL);
+
+   v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
 
    cmd_buffer->state.query.active_query.bo = NULL;
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
 }
 
+static void
+v3dv_cmd_buffer_end_performance_query(struct v3dv_cmd_buffer *cmd_buffer,
+                                      struct v3dv_query_pool *pool,
+                                      uint32_t query)
+{
+   assert(query < pool->query_count);
+   assert(cmd_buffer->state.query.active_query.perf != NULL);
+
+   if (cmd_buffer->state.pass)
+      v3dv_cmd_buffer_subpass_finish(cmd_buffer);
+
+   v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
+
+   cmd_buffer->state.query.active_query.perf = NULL;
+
+   if (cmd_buffer->state.pass)
+      v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
+}
+
+void v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
+                               struct v3dv_query_pool *pool,
+                               uint32_t query)
+{
+   switch (pool->query_type) {
+   case VK_QUERY_TYPE_OCCLUSION:
+      v3dv_cmd_buffer_end_occlusion_query(cmd_buffer, pool, query);
+      break;
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+      v3dv_cmd_buffer_end_performance_query(cmd_buffer, pool, query);
+      break;
+   default:
+      unreachable("Unsupported query type");
+   }
+}
+
 void
 v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
                                    struct v3dv_query_pool *pool,
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index 21ffdbbc07b..6102b0b42cf 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -133,6 +133,7 @@ get_device_extensions(const struct v3dv_physical_device *device,
       .KHR_get_memory_requirements2        = true,
       .KHR_image_format_list               = true,
       .KHR_imageless_framebuffer           = true,
+      .KHR_performance_query               = device->caps.perfmon,
       .KHR_relaxed_block_layout            = true,
       .KHR_maintenance1                    = true,
       .KHR_maintenance2                    = true,
@@ -816,6 +817,9 @@ physical_device_init(struct v3dv_physical_device *device,
    device->caps.multisync =
       v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_MULTISYNC_EXT);
 
+   device->caps.perfmon =
+      v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_PERFMON);
+
    result = init_uuids(device);
    if (result != VK_SUCCESS)
       goto fail;
@@ -1144,6 +1148,7 @@ VKAPI_ATTR void VKAPI_CALL
 v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
                                 VkPhysicalDeviceFeatures2 *pFeatures)
 {
+   V3DV_FROM_HANDLE(v3dv_physical_device, physical_device, physicalDevice);
    v3dv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
 
    VkPhysicalDeviceVulkan13Features vk13 = {
@@ -1289,6 +1294,16 @@ v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
          break;
       }
 
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: {
+         VkPhysicalDevicePerformanceQueryFeaturesKHR *features =
+            (void *) ext;
+
+         features->performanceCounterQueryPools =
+            physical_device->caps.perfmon;
+         features->performanceCounterMultipleQueryPools = false;
+         break;
+      }
+
       default:
          v3dv_debug_ignored_stype(ext->sType);
          break;
@@ -1637,6 +1652,13 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
          props->maxVertexAttribDivisor = 0xffff;
          break;
       }
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR : {
+         VkPhysicalDevicePerformanceQueryPropertiesKHR *props =
+            (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext;
+
+         props->allowCommandBufferQueryCopies = true;
+         break;
+      }
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: {
          VkPhysicalDeviceDrmPropertiesEXT *props =
             (VkPhysicalDeviceDrmPropertiesEXT *)ext;
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 6c1399b04d7..cfd32ec7ad6 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -188,6 +188,7 @@ struct v3dv_physical_device {
 
    struct {
       bool multisync;
+      bool perfmon;
    } caps;
 };
 
@@ -263,6 +264,11 @@ struct v3dv_queue {
    struct v3dv_last_job_sync last_job_syncs;
 
    struct v3dv_job *noop_job;
+
+   /* The last active perfmon ID to prevent mixing of counter results when a
+    * job is submitted with a different perfmon id.
+    */
+   uint32_t last_perfmon_id;
 };
 
 VkResult v3dv_queue_driver_submit(struct vk_queue *vk_queue,
@@ -1027,6 +1033,19 @@ struct v3dv_timestamp_query_cpu_job_info {
    uint32_t count;
 };
 
+/* Number of perfmons required to handle all supported performance counters */
+#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_PERFCNT_NUM, \
+                                       DRM_V3D_MAX_PERF_COUNTERS)
+
+struct v3dv_perf_query {
+   uint32_t kperfmon_ids[V3DV_MAX_PERFMONS];
+
+   /* A DRM syncobj to wait on the GPU jobs for which we are collecting
+    * performance data.
+    */
+   struct vk_sync *last_job_sync;
+};
+
 struct v3dv_job {
    struct list_head list_link;
 
@@ -1127,6 +1146,9 @@ struct v3dv_job {
       uint32_t wg_base[3];
       struct drm_v3d_submit_csd submit;
    } csd;
+
+   /* Perfmons with last job sync for CSD and CL jobs */
+   struct v3dv_perf_query *perf;
 };
 
 void v3dv_job_init(struct v3dv_job *job,
@@ -1328,12 +1350,15 @@ struct v3dv_cmd_buffer_state {
          struct v3dv_end_query_cpu_job_info *states;
       } end;
 
-      /* This BO is not NULL if we have an active query, that is, we have
-       * called vkCmdBeginQuery but not vkCmdEndQuery.
-       */
       struct {
+         /* This BO is not NULL if we have an active occlusion query, that is,
+          * we have called vkCmdBeginQuery but not vkCmdEndQuery.
+          */
          struct v3dv_bo *bo;
          uint32_t offset;
+
+         /* This pointer is not NULL if we have an active performance query */
+         struct v3dv_perf_query *perf;
       } active_query;
    } query;
 };
@@ -1375,6 +1400,9 @@ struct v3dv_query {
       };
       /* Used by CPU queries (timestamp) */
       uint64_t value;
+
+      /* Used by performance queries */
+      struct v3dv_perf_query perf;
    };
 };
 
@@ -1383,18 +1411,32 @@ struct v3dv_query_pool {
 
    struct v3dv_bo *bo; /* Only used with GPU queries (occlusion) */
 
+   /* Only used with performance queries */
+   struct {
+      uint32_t ncounters;
+      uint8_t counters[V3D_PERFCNT_NUM];
+
+      /* V3D has a limit on the number of counters we can track in a
+       * single performance monitor, so if too many counters are requested
+       * we need to create multiple monitors to record all of them. This
+       * field represents the number of monitors required for the number
+       * of counters requested.
+       */
+      uint8_t nperfmons;
+   } perfmon;
+
    VkQueryType query_type;
    uint32_t query_count;
    struct v3dv_query *queries;
 };
 
-VkResult v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
-                                         struct v3dv_query_pool *pool,
-                                         uint32_t first,
-                                         uint32_t count,
-                                         void *data,
-                                         VkDeviceSize stride,
-                                         VkQueryResultFlags flags);
+VkResult v3dv_get_query_pool_results(struct v3dv_device *device,
+                                     struct v3dv_query_pool *pool,
+                                     uint32_t first,
+                                     uint32_t count,
+                                     void *data,
+                                     VkDeviceSize stride,
+                                     VkQueryResultFlags flags);
 
 void v3dv_reset_query_pools(struct v3dv_device *device,
                             struct v3dv_query_pool *query_pool,
diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c
index 60edfc52442..4e188fd5512 100644
--- a/src/broadcom/vulkan/v3dv_query.c
+++ b/src/broadcom/vulkan/v3dv_query.c
@@ -25,6 +25,148 @@
 
 #include "util/timespec.h"
 
+static const char *v3dv_counters[][3] = {
+   {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
+   {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
+   {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"},
+   {"FEP", "FEP-valid-quads", "[FEP] Valid quads"},
+   {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"},
+   {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"},
+   {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"},
+   {"TLB", "TLB-quads-with-zero-coverage", "[TLB] Quads with all pixels having zero coverage"},
+   {"TLB", "TLB-quads-with-non-zero-coverage", "[TLB] Quads with any pixels having non-zero coverage"},
+   {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"},
+   {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"},
+   {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"},
+   {"PTB", "PTB-primitives-discared-reversed", "[PTB] Primitives that are discarded because they are reversed"},
+   {"QPU", "QPU-total-idle-clk-cycles", "[QPU] Total idle clock cycles for all QPUs"},
+   {"QPU", "QPU-total-active-clk-cycles-vertex-coord-shading", "[QPU] Total active clock cycles for all QPUs doing vertex/coordinate/user shading (counts only when QPU is not stalled)"},
+   {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"},
+   {"QPU", "QPU-total-clk-cycles-executing-valid-instr", "[QPU] Total clock cycles for all QPUs executing valid instructions"},
+   {"QPU", "QPU-total-clk-cycles-waiting-TMU", "[QPU] Total clock cycles for all QPUs stalled waiting for TMUs only (counter won't increment if QPU also stalling for another reason)"},
+   {"QPU", "QPU-total-clk-cycles-waiting-scoreboard", "[QPU] Total clock cycles for all QPUs stalled waiting for Scoreboard only (counter won't increment if QPU also stalling for another reason)"},
+   {"QPU", "QPU-total-clk-cycles-waiting-varyings", "[QPU] Total clock cycles for all QPUs stalled waiting for Varyings only (counter won't increment if QPU also stalling for another reason)"},
+   {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"},
+   {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"},
+   {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"},
+   {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"},
+   {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"},
+   {"TMU", "TMU-total-text-cache-miss", "[TMU] Total texture cache misses (number of fetches from memory/L2cache)"},
+   {"VPM", "VPM-total-clk-cycles-VDW-stalled", "[VPM] Total clock cycles VDW is stalled waiting for VPM access"},
+   {"VPM", "VPM-total-clk-cycles-VCD-stalled", "[VPM] Total clock cycles VCD is stalled waiting for VPM access"},
+   {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"},
+   {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"},
+   {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"},
+   {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"},
+   {"CORE", "cycle-count", "[CORE] Cycle counter"},
+   {"QPU", "QPU-total-clk-cycles-waiting-vertex-coord-shading", "[QPU] Total stalled clock cycles for all QPUs doing vertex/coordinate/user shading"},
+   {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"},
+   {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"},
+   {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"},
+   {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"},
+   {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"},
+   {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"},
+   {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"},
+   {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"},
+   {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"},
+   {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"},
+   {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"},
+   {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"},
+   {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"},
+   {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"},
+   {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"},
+   {"TMU", "TMU-total-config-access", "[TMU] Total config accesses"},
+   {"L2T", "L2T-no-id-stalled", "[L2T] No ID stall"},
+   {"L2T", "L2T-command-queue-stalled", "[L2T] Command queue full stall"},
+   {"L2T", "L2T-TMU-writes", "[L2T] TMU write accesses"},
+   {"TMU", "TMU-active-cycles", "[TMU] Active cycles"},
+   {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"},
+   {"CLE", "CLE-thread-active-cycles", "[CLE] Bin or render thread active cycles"},
+   {"L2T", "L2T-TMU-reads", "[L2T] TMU read accesses"},
+   {"L2T", "L2T-CLE-reads", "[L2T] CLE read accesses"},
+   {"L2T", "L2T-VCD-reads", "[L2T] VCD read accesses"},
+   {"L2T", "L2T-TMU-config-reads", "[L2T] TMU CFG read accesses"},
+   {"L2T", "L2T-SLC0-reads", "[L2T] SLC0 read accesses"},
+   {"L2T", "L2T-SLC1-reads", "[L2T] SLC1 read accesses"},
+   {"L2T", "L2T-SLC2-reads", "[L2T] SLC2 read accesses"},
+   {"L2T", "L2T-TMU-write-miss", "[L2T] TMU write misses"},
+   {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"},
+   {"L2T", "L2T-CLE-read-miss", "[L2T] CLE read misses"},
+   {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"},
+   {"L2T", "L2T-TMU-config-read-miss", "[L2T] TMU CFG read misses"},
+   {"L2T", "L2T-SLC0-read-miss", "[L2T] SLC0 read misses"},
+   {"L2T", "L2T-SLC1-read-miss", "[L2T] SLC1 read misses"},
+   {"L2T", "L2T-SLC2-read-miss", "[L2T] SLC2 read misses"},
+   {"CORE", "core-memory-writes", "[CORE] Total memory writes"},
+   {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"},
+   {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"},
+   {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"},
+   {"CORE", "core-memory-reads", "[CORE] Total memory reads"},
+   {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"},
+   {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"},
+   {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"},
+   {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"},
+   {"GMP", "GMP-memory-reads", "[GMP] Total memory reads"},
+   {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"},
+   {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"},
+   {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"},
+   {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"},
+   {"TMU", "TMU-MRU-hits", "[TMU] Total MRU hits"},
+   {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
+};
+
+static void
+kperfmon_create(struct v3dv_device *device,
+                struct v3dv_query_pool *pool,
+                uint32_t query)
+{
+   for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+      assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters);
+
+      struct drm_v3d_perfmon_create req = {
+         .ncounters = MIN2(pool->perfmon.ncounters -
+                           i * DRM_V3D_MAX_PERF_COUNTERS,
+                           DRM_V3D_MAX_PERF_COUNTERS),
+      };
+      memcpy(req.counters,
+             &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS],
+             req.ncounters);
+
+      int ret = v3dv_ioctl(device->pdevice->render_fd,
+                           DRM_IOCTL_V3D_PERFMON_CREATE,
+                           &req);
+      if (ret)
+         fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret));
+
+      pool->queries[query].perf.kperfmon_ids[i] = req.id;
+   }
+}
+
+static void
+kperfmon_destroy(struct v3dv_device *device,
+                 struct v3dv_query_pool *pool,
+                 uint32_t query)
+{
+   /* Skip destroying if never created */
+   if (!pool->queries[query].perf.kperfmon_ids[0])
+      return;
+
+   for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+      struct drm_v3d_perfmon_destroy req = {
+         .id = pool->queries[query].perf.kperfmon_ids[i]
+      };
+
+      int ret = v3dv_ioctl(device->pdevice->render_fd,
+                           DRM_IOCTL_V3D_PERFMON_DESTROY,
+                           &req);
+
+      if (ret) {
+         fprintf(stderr, "Failed to destroy perfmon %u: %s\n",
+                 req.id, strerror(ret));
+      }
+   }
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 v3dv_CreateQueryPool(VkDevice _device,
                      const VkQueryPoolCreateInfo *pCreateInfo,
@@ -34,7 +176,8 @@ v3dv_CreateQueryPool(VkDevice _device,
    V3DV_FROM_HANDLE(v3dv_device, device, _device);
 
    assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION ||
-          pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP);
+          pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP ||
+          pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
    assert(pCreateInfo->queryCount > 0);
 
    struct v3dv_query_pool *pool =
@@ -46,6 +189,7 @@ v3dv_CreateQueryPool(VkDevice _device,
    pool->query_type = pCreateInfo->queryType;
    pool->query_count = pCreateInfo->queryCount;
 
+   uint32_t query_idx = 0;
    VkResult result;
 
    const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count;
@@ -56,7 +200,8 @@ v3dv_CreateQueryPool(VkDevice _device,
       goto fail;
    }
 
-   if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+   switch (pool->query_type) {
+   case VK_QUERY_TYPE_OCCLUSION: {
       /* The hardware allows us to setup groups of 16 queries in consecutive
        * 4-byte addresses, requiring only that each group of 16 queries is
        * aligned to a 1024 byte boundary.
@@ -72,22 +217,56 @@ v3dv_CreateQueryPool(VkDevice _device,
          result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
          goto fail;
       }
+      break;
    }
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+      const VkQueryPoolPerformanceCreateInfoKHR *pq_info =
+         vk_find_struct_const(pCreateInfo->pNext,
+                              QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
+
+      assert(pq_info);
+      assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM);
+
+      pool->perfmon.ncounters = pq_info->counterIndexCount;
+      for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
+         pool->perfmon.counters[i] = pq_info->pCounterIndices[i];
+
+      pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters,
+                                             DRM_V3D_MAX_PERF_COUNTERS);
 
-   uint32_t i;
-   for (i = 0; i < pool->query_count; i++) {
-      pool->queries[i].maybe_available = false;
+      assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS);
+      break;
+   }
+   case VK_QUERY_TYPE_TIMESTAMP:
+      break;
+   default:
+      unreachable("Unsupported query type");
+   }
+
+   for (; query_idx < pool->query_count; query_idx++) {
+      pool->queries[query_idx].maybe_available = false;
       switch (pool->query_type) {
       case VK_QUERY_TYPE_OCCLUSION: {
-         const uint32_t query_group = i / 16;
-         const uint32_t query_offset = query_group * 1024 + (i % 16) * 4;
-         pool->queries[i].bo = pool->bo;
-         pool->queries[i].offset = query_offset;
+         const uint32_t query_group = query_idx / 16;
+         const uint32_t query_offset = query_group * 1024 + (query_idx % 16) * 4;
+         pool->queries[query_idx].bo = pool->bo;
+         pool->queries[query_idx].offset = query_offset;
          break;
          }
       case VK_QUERY_TYPE_TIMESTAMP:
-         pool->queries[i].value = 0;
+         pool->queries[query_idx].value = 0;
+         break;
+      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+         result = vk_sync_create(&device->vk,
+                                 &device->pdevice->drm_syncobj_type, 0, 0,
+                                 &pool->queries[query_idx].perf.last_job_sync);
+         if (result != VK_SUCCESS)
+            goto fail;
+
+         for (uint32_t j = 0; j < pool->perfmon.nperfmons; j++)
+            pool->queries[query_idx].perf.kperfmon_ids[j] = 0;
          break;
+         }
       default:
          unreachable("Unsupported query type");
       }
@@ -98,6 +277,11 @@ v3dv_CreateQueryPool(VkDevice _device,
    return VK_SUCCESS;
 
 fail:
+   if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+      for (uint32_t j = 0; j < query_idx; j++)
+         vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync);
+   }
+
    if (pool->bo)
       v3dv_bo_free(device, pool->bo);
    if (pool->queries)
@@ -121,6 +305,13 @@ v3dv_DestroyQueryPool(VkDevice _device,
    if (pool->bo)
       v3dv_bo_free(device, pool->bo);
 
+   if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+      for (uint32_t i = 0; i < pool->query_count; i++) {
+         kperfmon_destroy(device, pool, i);
+         vk_sync_destroy(&device->vk, pool->queries[i].perf.last_job_sync);
+      }
+   }
+
    if (pool->queries)
       vk_free2(&device->vk.alloc, pAllocator, pool->queries);
 
@@ -128,7 +319,7 @@ v3dv_DestroyQueryPool(VkDevice _device,
 }
 
 static void
-write_query_result(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
+write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
 {
    if (do_64bit) {
       uint64_t *dst64 = (uint64_t *) dst;
@@ -177,13 +368,91 @@ query_wait_available(struct v3dv_device *device,
        !v3dv_bo_wait(device, q->bo, 0xffffffffffffffffull))
       return vk_device_set_lost(&device->vk, "Query BO wait failed: %m");
 
+   if (query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
+       vk_sync_wait(&device->vk, q->perf.last_job_sync,
+                    0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS)
+      return vk_device_set_lost(&device->vk, "Query job wait failed");
+
    return VK_SUCCESS;
 }
 
 static VkResult
-query_is_available(struct v3dv_device *device,
-                   struct v3dv_query *q,
-                   VkQueryType query_type)
+write_occlusion_query_result(struct v3dv_device *device,
+                             struct v3dv_query_pool *pool,
+                             uint32_t query,
+                             bool do_64bit,
+                             void *data,
+                             uint32_t slot)
+{
+   assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION);
+
+   if (vk_device_is_lost(&device->vk))
+      return VK_ERROR_DEVICE_LOST;
+
+   struct v3dv_query *q = &pool->queries[query];
+   assert(q->bo && q->bo->map);
+
+   const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset;
+   write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr));
+   return VK_SUCCESS;
+}
+
+static VkResult
+write_timestamp_query_result(struct v3dv_device *device,
+                             struct v3dv_query_pool *pool,
+                             uint32_t query,
+                             bool do_64bit,
+                             void *data,
+                             uint32_t slot)
+{
+   assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
+
+   struct v3dv_query *q = &pool->queries[query];
+
+   write_to_buffer(data, slot, do_64bit, q->value);
+   return VK_SUCCESS;
+}
+
+static VkResult
+write_performance_query_result(struct v3dv_device *device,
+                               struct v3dv_query_pool *pool,
+                               uint32_t query,
+                               bool do_64bit,
+                               void *data,
+                               uint32_t slot)
+{
+   assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+
+   struct v3dv_query *q = &pool->queries[query];
+   uint64_t counter_values[V3D_PERFCNT_NUM];
+
+   for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+      struct drm_v3d_perfmon_get_values req = {
+         .id = q->perf.kperfmon_ids[i],
+         .values_ptr = (uintptr_t)(&counter_values[i *
+                                   DRM_V3D_MAX_PERF_COUNTERS])
+      };
+
+      int ret = v3dv_ioctl(device->pdevice->render_fd,
+                           DRM_IOCTL_V3D_PERFMON_GET_VALUES,
+                           &req);
+
+      if (ret) {
+         fprintf(stderr, "failed to get perfmon values: %s\n", strerror(ret));
+         return vk_error(device, VK_ERROR_DEVICE_LOST);
+      }
+   }
+
+   for (uint32_t i = 0; i < pool->perfmon.ncounters; i++)
+      write_to_buffer(data, slot + i, do_64bit, counter_values[i]);
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+query_check_available(struct v3dv_device *device,
+                      struct v3dv_query *q,
+                      VkQueryType query_type)
 {
    if (!q->maybe_available)
       return VK_NOT_READY;
@@ -192,70 +461,105 @@ query_is_available(struct v3dv_device *device,
        !v3dv_bo_wait(device, q->bo, 0))
       return VK_NOT_READY;
 
+   if (query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
+       vk_sync_wait(&device->vk, q->perf.last_job_sync,
+                    0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS)
+      return VK_NOT_READY;
+
    return VK_SUCCESS;
 }
 
+/* Writes the result of query 'query' into the results buffer at index
+ * 'slot', dispatching on the pool's query type.
+ */
 static VkResult
-get_query_result(struct v3dv_device *device,
-                 struct v3dv_query_pool *pool,
-                 uint32_t query,
-                 bool do_wait,
-                 bool *available,
-                 uint64_t *value)
+write_query_result(struct v3dv_device *device,
+                   struct v3dv_query_pool *pool,
+                   uint32_t query,
+                   bool do_64bit,
+                   void *data,
+                   uint32_t slot)
+{
+   switch (pool->query_type) {
+   case VK_QUERY_TYPE_OCCLUSION:
+      return write_occlusion_query_result(device, pool, query, do_64bit,
+                                          data, slot);
+   case VK_QUERY_TYPE_TIMESTAMP:
+      return write_timestamp_query_result(device, pool, query, do_64bit,
+                                          data, slot);
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+      return write_performance_query_result(device, pool, query, do_64bit,
+                                            data, slot);
+   default:
+      unreachable("Unsupported query type");
+   }
+}
+
+/* Determines whether the result of query 'query' is available, storing
+ * the outcome in *available. When do_wait is set, blocks until the
+ * query result is ready (or an error occurs); otherwise only checks
+ * the current state.
+ */
+static VkResult
+query_is_available(struct v3dv_device *device,
+                   struct v3dv_query_pool *pool,
+                   uint32_t query,
+                   bool do_wait,
+                   bool *available)
 {
    struct v3dv_query *q = &pool->queries[query];
 
+   /* Occlusion query results live in a BO, which must be mapped. */
+   assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION ||
+          (q->bo && q->bo->map));
+
    if (do_wait) {
       VkResult result = query_wait_available(device, q, pool->query_type);
-      if (result != VK_SUCCESS)
+      if (result != VK_SUCCESS) {
+         /* Make sure *available is well-defined on the error path too. */
+         *available = false;
          return result;
+      }
 
       *available = true;
    } else {
-      VkResult result = query_is_available(device, q, pool->query_type);
+      VkResult result = query_check_available(device, q, pool->query_type);
       assert(result == VK_SUCCESS || result == VK_NOT_READY);
       *available = (result == VK_SUCCESS);
    }
 
-   switch (pool->query_type) {
-   case VK_QUERY_TYPE_OCCLUSION: {
-      const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset;
-      *value = (uint64_t) *((uint32_t *)query_addr);
-      return VK_SUCCESS;
-   }
+   return VK_SUCCESS;
+}
 
+/* Returns how many result slots a single query of this pool occupies:
+ * one value for occlusion/timestamp queries, one value per enabled
+ * counter for performance queries.
+ */
+static uint32_t
+get_query_result_count(struct v3dv_query_pool *pool)
+{
+   switch (pool->query_type) {
+   case VK_QUERY_TYPE_OCCLUSION:
    case VK_QUERY_TYPE_TIMESTAMP:
-      *value = q->value;
-      return VK_SUCCESS;
-
+      return 1;
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+      return pool->perfmon.ncounters;
    default:
       unreachable("Unsupported query type");
    }
 }
 
 VkResult
-v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
-                                struct v3dv_query_pool *pool,
-                                uint32_t first,
-                                uint32_t count,
-                                void *data,
-                                VkDeviceSize stride,
-                                VkQueryResultFlags flags)
+v3dv_get_query_pool_results(struct v3dv_device *device,
+                            struct v3dv_query_pool *pool,
+                            uint32_t first,
+                            uint32_t count,
+                            void *data,
+                            VkDeviceSize stride,
+                            VkQueryResultFlags flags)
 {
    assert(first < pool->query_count);
    assert(first + count <= pool->query_count);
    assert(data);
 
-   const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT;
+   const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT ||
+      pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR;
    const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT;
    const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;
 
+   uint32_t result_count = get_query_result_count(pool);
+
    VkResult result = VK_SUCCESS;
    for (uint32_t i = first; i < first + count; i++) {
       bool available = false;
-      uint64_t value = 0;
       VkResult query_result =
-         get_query_result(device, pool, i, do_wait, &available, &value);
+         query_is_available(device, pool, i, do_wait, &available);
       if (query_result == VK_ERROR_DEVICE_LOST)
          result = VK_ERROR_DEVICE_LOST;
 
@@ -273,11 +577,11 @@ v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
 
       const bool write_result = available || do_partial;
       if (write_result)
-         write_query_result(data, slot, do_64bit, value);
-      slot++;
+         write_query_result(device, pool, i, do_64bit, data, slot);
+      slot += result_count;
 
       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
-         write_query_result(data, slot++, do_64bit, available ? 1u : 0u);
+         write_to_buffer(data, slot++, do_64bit, available ? 1u : 0u);
 
       if (!write_result && result != VK_ERROR_DEVICE_LOST)
          result = VK_NOT_READY;
@@ -301,8 +605,8 @@ v3dv_GetQueryPoolResults(VkDevice _device,
    V3DV_FROM_HANDLE(v3dv_device, device, _device);
    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
 
-   return v3dv_get_query_pool_results_cpu(device, pool, firstQuery, queryCount,
-                                          pData, stride, flags);
+   return v3dv_get_query_pool_results(device, pool, firstQuery, queryCount,
+                                      pData, stride, flags);
 }
 
 VKAPI_ATTR void VKAPI_CALL
@@ -381,6 +685,12 @@ v3dv_reset_query_pools(struct v3dv_device *device,
       case VK_QUERY_TYPE_TIMESTAMP:
          q->value = 0;
          break;
+      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+         kperfmon_destroy(device, pool, i);
+         kperfmon_create(device, pool, i);
+         if (vk_sync_reset(&device->vk, q->perf.last_job_sync) != VK_SUCCESS)
+            fprintf(stderr, "Failed to reset sync");
+         break;
       default:
          unreachable("Unsupported query type");
       }
@@ -400,3 +710,69 @@ v3dv_ResetQueryPool(VkDevice _device,
 
    v3dv_reset_query_pools(device, pool, firstQuery, queryCount);
 }
+
+/* Advertises every performance counter in the v3dv_counters table,
+ * together with its human-readable description, per
+ * VK_KHR_performance_query.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
+   VkPhysicalDevice physicalDevice,
+   uint32_t queueFamilyIndex,
+   uint32_t *pCounterCount,
+   VkPerformanceCounterKHR *pCounters,
+   VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
+{
+   /* Both output arrays are bounded by the same caller-supplied count;
+    * take a private copy so each outarray tracks its own cursor.
+    */
+   uint32_t desc_count = *pCounterCount;
+
+   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
+                          out, pCounters, pCounterCount);
+   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
+                          out_desc, pCounterDescriptions, &desc_count);
+
+   for (int i = 0; i < ARRAY_SIZE(v3dv_counters); i++) {
+      vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
+         counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
+         counter->scope = VK_QUERY_SCOPE_COMMAND_KHR;
+         counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
+
+         /* The counter UUID is derived from the SHA1 of its name;
+          * only the first sizeof(uuid) bytes of the digest are kept.
+          */
+         unsigned char sha1_result[20];
+         _mesa_sha1_compute(v3dv_counters[i][1], strlen(v3dv_counters[i][1]),
+                            sha1_result);
+
+         memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
+      }
+
+      vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
+                               &out_desc, desc) {
+         desc->flags = 0;
+         /* v3dv_counters rows are { category, name, description }. */
+         snprintf(desc->name, sizeof(desc->name), "%s",
+            v3dv_counters[i][1]);
+         snprintf(desc->category, sizeof(desc->category), "%s",
+            v3dv_counters[i][0]);
+         snprintf(desc->description, sizeof(desc->description), "%s",
+            v3dv_counters[i][2]);
+      }
+   }
+
+   return vk_outarray_status(&out);
+}
+
+/* Reports how many submission passes are needed to sample the
+ * requested counters: a kernel perfmon holds at most
+ * DRM_V3D_MAX_PERF_COUNTERS counters, so one pass is required per
+ * group of that many selected counters.
+ */
+VKAPI_ATTR void VKAPI_CALL
+v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
+   VkPhysicalDevice physicalDevice,
+   const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
+   uint32_t *pNumPasses)
+{
+   *pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount,
+                              DRM_V3D_MAX_PERF_COUNTERS);
+}
+
+/* No per-device setup is performed for profiling sessions here, so
+ * acquiring the profiling lock trivially succeeds.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_AcquireProfilingLockKHR(
+   VkDevice _device,
+   const VkAcquireProfilingLockInfoKHR *pInfo)
+{
+   return VK_SUCCESS;
+}
+
+/* Counterpart to v3dv_AcquireProfilingLockKHR; nothing was acquired,
+ * so there is nothing to release.
+ */
+VKAPI_ATTR void VKAPI_CALL
+v3dv_ReleaseProfilingLockKHR(VkDevice device)
+{
+}
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index 799139b9174..a3d92466d88 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -137,27 +137,129 @@ handle_reset_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job,
    if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION)
       v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE);
 
+   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+      struct vk_sync_wait waits[info->count];
+      unsigned wait_count = 0;
+      for (int i = 0; i < info->count; i++) {
+         struct v3dv_query *query = &info->pool->queries[i];
+         /* Only wait for a query if we've used it otherwise we will be
+          * waiting forever for the fence to become signaled.
+          */
+         if (query->maybe_available) {
+            waits[wait_count] = (struct vk_sync_wait){
+               .sync = info->pool->queries[i].perf.last_job_sync
+            };
+            wait_count++;
+         };
+      }
+
+      VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
+                                          VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
+
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
    v3dv_reset_query_pools(job->device, info->pool, info->first, info->count);
 
    return VK_SUCCESS;
 }
 
 static VkResult
-handle_end_query_cpu_job(struct v3dv_job *job)
+export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int *fd)
+{
+   int err;
+   if (job->device->pdevice->caps.multisync) {
+      static const enum v3dv_queue_type queues_to_sync[] = {
+         V3DV_QUEUE_CL,
+         V3DV_QUEUE_CSD,
+      };
+
+      for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
+         enum v3dv_queue_type queue_type = queues_to_sync[i];
+         int tmp_fd = -1;
+
+         err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
+                                        queue->last_job_syncs.syncs[queue_type],
+                                        &tmp_fd);
+
+         if (err) {
+            close(*fd);
+            return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+                             "sync file export failed: %m");
+         }
+
+         err = sync_accumulate("v3dv", fd, tmp_fd);
+
+         if (err) {
+            close(tmp_fd);
+            close(*fd);
+            return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+                             "failed to accumulate sync files: %m");
+         }
+      }
+   } else {
+      err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
+                                     queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
+                                     fd);
+
+      if (err) {
+         return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+                          "sync file export failed: %m");
+      }
+   }
+   return VK_SUCCESS;
+}
+
+/* CPU job run at end-query time: flags each affected query as
+ * maybe_available and, for performance queries, imports the current
+ * last-job sync file into each query's syncobj so result availability
+ * can be tracked later.
+ *
+ * counter_pass_idx is part of the common job-handler signature and is
+ * not needed here.
+ */
+static VkResult
+handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
 {
+   VkResult result = VK_SUCCESS;
+
    mtx_lock(&job->device->query_mutex);
 
    struct v3dv_end_query_cpu_job_info *info = &job->cpu.query_end;
+   struct v3dv_queue *queue = &job->device->queue;
+
+   int err = 0;
+   int fd = -1;
+
+   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+      result = export_perfmon_last_job_sync(queue, job, &fd);
+
+      if (result != VK_SUCCESS)
+         goto fail;
+
+      assert(fd >= 0);
+   }
+
    for (uint32_t i = 0; i < info->count; i++) {
       assert(info->query + i < info->pool->query_count);
       struct v3dv_query *query = &info->pool->queries[info->query + i];
+
+      if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+         uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
+         err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd,
+                                        syncobj, fd);
+
+         if (err) {
+            result = vk_errorf(queue, VK_ERROR_UNKNOWN,
+                               "sync file import failed: %m");
+            goto fail;
+         }
+      }
+
       query->maybe_available = true;
    }
 
+fail:
+   /* Reached on success and failure alike: release the exported sync
+    * file if one was actually created (fd stays -1 when the export
+    * failed, so don't close(-1) in that case), then wake any waiters.
+    */
+   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
+       fd >= 0)
+      close(fd);
+
    cnd_broadcast(&job->device->query_ended);
    mtx_unlock(&job->device->query_mutex);
 
-   return VK_SUCCESS;
+   return result;
 }
 
 static VkResult
@@ -176,13 +278,13 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job)
 
    uint8_t *offset = ((uint8_t *) bo->map) +
                      info->offset + info->dst->mem_offset;
-   v3dv_get_query_pool_results_cpu(job->device,
-                                   info->pool,
-                                   info->first,
-                                   info->count,
-                                   offset,
-                                   info->stride,
-                                   info->flags);
+   v3dv_get_query_pool_results(job->device,
+                               info->pool,
+                               info->first,
+                               info->count,
+                               offset,
+                               info->stride,
+                               info->flags);
 
    return VK_SUCCESS;
 }
@@ -635,6 +737,7 @@ fail:
 static VkResult
 handle_cl_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
+              uint32_t counter_pass_idx,
               struct v3dv_submit_sync_info *sync_info,
               bool signal_syncs)
 {
@@ -678,9 +781,15 @@ handle_cl_job(struct v3dv_queue *queue,
    assert(bo_idx == submit.bo_handle_count);
    submit.bo_handles = (uintptr_t)(void *)bo_handles;
 
+   submit.perfmon_id = job->perf ?
+      job->perf->kperfmon_ids[counter_pass_idx] : 0;
+   const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id;
+   queue->last_perfmon_id = submit.perfmon_id;
+
    /* We need a binning sync if we are waiting on a semaphore with a wait stage
     * that involves the geometry pipeline, or if the job comes after a pipeline
-    * barrier that involves geometry stages (needs_bcl_sync).
+    * barrier that involves geometry stages (needs_bcl_sync), or if
+    * performance queries are in use.
     *
     * We need a render sync if the job doesn't need a binning sync but has
     * still been flagged for serialization. It should be noted that RCL jobs
@@ -705,6 +814,7 @@ handle_cl_job(struct v3dv_queue *queue,
           VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
           VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT);
    }
+   needs_bcl_sync |= needs_perf_sync;
    bool needs_rcl_sync = job->serialize && !needs_bcl_sync;
 
    /* Replace single semaphore settings whenever our kernel-driver supports
@@ -795,6 +905,7 @@ handle_tfu_job(struct v3dv_queue *queue,
 static VkResult
 handle_csd_job(struct v3dv_queue *queue,
                struct v3dv_job *job,
+               uint32_t counter_pass_idx,
                struct v3dv_submit_sync_info *sync_info,
                bool signal_syncs)
 {
@@ -835,6 +946,9 @@ handle_csd_job(struct v3dv_queue *queue,
       submit->in_sync = needs_sync ? last_job_sync : 0;
       submit->out_sync = last_job_sync;
    }
+   submit->perfmon_id = job->perf ?
+      job->perf->kperfmon_ids[counter_pass_idx] : 0;
+   queue->last_perfmon_id = submit->perfmon_id;
    int ret = v3dv_ioctl(device->pdevice->render_fd,
                         DRM_IOCTL_V3D_SUBMIT_CSD, submit);
 
@@ -858,20 +972,21 @@ handle_csd_job(struct v3dv_queue *queue,
 static VkResult
 queue_handle_job(struct v3dv_queue *queue,
                  struct v3dv_job *job,
+                 uint32_t counter_pass_idx,
                  struct v3dv_submit_sync_info *sync_info,
                  bool signal_syncs)
 {
    switch (job->type) {
    case V3DV_JOB_TYPE_GPU_CL:
-      return handle_cl_job(queue, job, sync_info, signal_syncs);
+      return handle_cl_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
    case V3DV_JOB_TYPE_GPU_TFU:
       return handle_tfu_job(queue, job, sync_info, signal_syncs);
    case V3DV_JOB_TYPE_GPU_CSD:
-      return handle_csd_job(queue, job, sync_info, signal_syncs);
+      return handle_csd_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
    case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
       return handle_reset_query_cpu_job(queue, job, sync_info);
    case V3DV_JOB_TYPE_CPU_END_QUERY:
-      return handle_end_query_cpu_job(job);
+      return handle_end_query_cpu_job(job, counter_pass_idx);
    case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
       return handle_copy_query_results_cpu_job(job);
    case V3DV_JOB_TYPE_CPU_SET_EVENT:
@@ -913,6 +1028,7 @@ queue_create_noop_job(struct v3dv_queue *queue)
 
 static VkResult
 queue_submit_noop_job(struct v3dv_queue *queue,
+                      uint32_t counter_pass_idx,
                       struct v3dv_submit_sync_info *sync_info,
                       bool signal_syncs)
 {
@@ -923,7 +1039,8 @@ queue_submit_noop_job(struct v3dv_queue *queue,
    }
 
    assert(queue->noop_job);
-   return queue_handle_job(queue, queue->noop_job, sync_info, signal_syncs);
+   return queue_handle_job(queue, queue->noop_job, counter_pass_idx,
+                           sync_info, signal_syncs);
 }
 
 VkResult
@@ -953,7 +1070,8 @@ v3dv_queue_driver_submit(struct vk_queue *vk_queue,
       list_for_each_entry_safe(struct v3dv_job, job,
                                &cmd_buffer->jobs, list_link) {
 
-         result = queue_handle_job(queue, job, &sync_info, false);
+         result = queue_handle_job(queue, job, submit->perf_pass_index,
+                                   &sync_info, false);
          if (result != VK_SUCCESS)
             return result;
       }
@@ -964,7 +1082,8 @@ v3dv_queue_driver_submit(struct vk_queue *vk_queue,
        * barrier state to limit the queues we serialize against.
        */
       if (cmd_buffer->state.barrier.dst_mask) {
-         result = queue_submit_noop_job(queue, &sync_info, false);
+         result = queue_submit_noop_job(queue, submit->perf_pass_index,
+                                        &sync_info, false);
          if (result != VK_SUCCESS)
             return result;
       }
@@ -976,7 +1095,8 @@ v3dv_queue_driver_submit(struct vk_queue *vk_queue,
     * requirements.
     */
    if (submit->signal_count > 0) {
-      result = queue_submit_noop_job(queue, &sync_info, true);
+      result = queue_submit_noop_job(queue, submit->perf_pass_index,
+                                     &sync_info, true);
       if (result != VK_SUCCESS)
          return result;
    }



More information about the mesa-commit mailing list