Mesa (master): turnip: Implement VK_KHR_performance_query
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Tue Dec 22 05:08:06 UTC 2020
Module: Mesa
Branch: master
Commit: 937dd76426b2b372a18be35e1416eed291524af7
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=937dd76426b2b372a18be35e1416eed291524af7
Author: Hyunjun Ko <zzoon at igalia.com>
Date: Fri Nov 20 05:32:27 2020 +0000
turnip: Implement VK_KHR_performance_query
Some commands are still unimplemented.
- vkGetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR:
The following patch supports this.
- vkAcquireProfilingLockKHR / vkReleaseProfilingLockKHR
This patch supports only monitoring perf counters for each submit.
To reserve/configure counters across submits we would need a kernel
interface to be able to do that.
Signed-off-by: Hyunjun Ko <zzoon at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6808>
---
src/freedreno/vulkan/meson.build | 1 +
src/freedreno/vulkan/tu_device.c | 14 ++
src/freedreno/vulkan/tu_extensions.py | 1 +
src/freedreno/vulkan/tu_private.h | 7 +
src/freedreno/vulkan/tu_query.c | 353 ++++++++++++++++++++++++++++++++--
5 files changed, 358 insertions(+), 18 deletions(-)
diff --git a/src/freedreno/vulkan/meson.build b/src/freedreno/vulkan/meson.build
index 3e22ab4d7b9..ef5912a39ed 100644
--- a/src/freedreno/vulkan/meson.build
+++ b/src/freedreno/vulkan/meson.build
@@ -132,6 +132,7 @@ libvulkan_freedreno = shared_library(
tu_link_with,
libfreedreno_ir3,
libfreedreno_layout,
+ libfreedreno_perfcntrs,
],
dependencies : [
idep_libfreedreno_common,
diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c
index 0fff5daec89..900c2090599 100644
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -614,6 +614,14 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
features->extendedDynamicState = true;
break;
}
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: {
+ VkPhysicalDevicePerformanceQueryFeaturesKHR *feature =
+ (VkPhysicalDevicePerformanceQueryFeaturesKHR *)ext;
+ feature->performanceCounterQueryPools = true;
+ feature->performanceCounterMultipleQueryPools = false;
+ break;
+ }
+
default:
break;
}
@@ -867,6 +875,12 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
props->supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT;
break;
}
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR: {
+ VkPhysicalDevicePerformanceQueryPropertiesKHR *properties =
+ (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext;
+ properties->allowCommandBufferQueryCopies = false;
+ break;
+ }
default:
break;
}
diff --git a/src/freedreno/vulkan/tu_extensions.py b/src/freedreno/vulkan/tu_extensions.py
index a47d701eb55..304399e7e9c 100644
--- a/src/freedreno/vulkan/tu_extensions.py
+++ b/src/freedreno/vulkan/tu_extensions.py
@@ -101,6 +101,7 @@ EXTENSIONS = [
Extension('VK_KHR_incremental_present', 1, 'TU_HAS_SURFACE'),
Extension('VK_KHR_image_format_list', 1, True),
Extension('VK_KHR_depth_stencil_resolve', 1, True),
+ Extension('VK_KHR_performance_query', 1, False),
]
MAX_API_VERSION = VkVersion(MAX_API_VERSION)
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index e48f88baeac..672ada602d5 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -64,6 +64,7 @@
#include "a6xx.xml.h"
#include "fdl/freedreno_layout.h"
#include "common/freedreno_dev_info.h"
+#include "perfcntrs/freedreno_perfcntr.h"
#include "tu_descriptor_set.h"
#include "tu_extensions.h"
@@ -1511,6 +1512,12 @@ struct tu_query_pool
uint64_t size;
uint32_t pipeline_statistics;
struct tu_bo bo;
+
+ /* For performance query */
+ const struct fd_perfcntr_group *perf_group;
+ uint32_t perf_group_count;
+ uint32_t counter_index_count;
+ uint32_t counter_indices[0];
};
uint32_t
diff --git a/src/freedreno/vulkan/tu_query.c b/src/freedreno/vulkan/tu_query.c
index 083de39e574..fa392a25fc9 100644
--- a/src/freedreno/vulkan/tu_query.c
+++ b/src/freedreno/vulkan/tu_query.c
@@ -39,6 +39,7 @@
#include "util/os_time.h"
#include "tu_cs.h"
+#include "vk_util.h"
#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5
@@ -96,6 +97,17 @@ struct PACKED primitive_query_slot {
struct primitive_slot_value end[4];
};
+struct PACKED perfcntr_query_slot {
+ uint64_t result;
+ uint64_t begin;
+ uint64_t end;
+};
+
+struct PACKED perf_query_slot {
+ struct query_slot common;
+ struct perfcntr_query_slot perfcntr;
+};
+
/* Returns the IOVA of a given uint64_t field in a given slot of a query
* pool. */
#define query_iova(type, pool, query, field) \
@@ -112,19 +124,62 @@ struct PACKED primitive_query_slot {
query_iova(struct primitive_query_slot, pool, query, field) + \
offsetof(struct primitive_slot_value, values[i])
+#define perf_query_iova(pool, query, field, i) \
+ pool->bo.iova + pool->stride * query + \
+ sizeof(struct query_slot) + \
+ sizeof(struct perfcntr_query_slot) * i + \
+ offsetof(struct perfcntr_query_slot, field)
+
#define query_available_iova(pool, query) \
query_iova(struct query_slot, pool, query, available)
-#define query_result_iova(pool, query, i) \
+#define query_result_iova(pool, query, type, i) \
pool->bo.iova + pool->stride * (query) + \
- sizeof(struct query_slot) + sizeof(uint64_t) * i
+ sizeof(struct query_slot) + sizeof(type) * i
-#define query_result_addr(pool, query, i) \
+#define query_result_addr(pool, query, type, i) \
pool->bo.map + pool->stride * query + \
- sizeof(struct query_slot) + sizeof(uint64_t) * i
+ sizeof(struct query_slot) + sizeof(type) * i
#define query_is_available(slot) slot->available
+static const VkPerformanceCounterUnitKHR
+fd_perfcntr_type_to_vk_unit[] = {
+ [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+ [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+ [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+ [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
+ [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
+ /* TODO. can be UNIT_NANOSECONDS_KHR with a logic to compute */
+ [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+ [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR,
+ [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+ [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+ [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+ [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+ [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+};
+
+/* TODO. Basically this comes from the freedreno implementation where
+ * only UINT64 is used. We'd better confirm this by the blob vulkan driver
+ * when it starts supporting perf query.
+ */
+static const VkPerformanceCounterStorageKHR
+fd_perfcntr_type_to_vk_storage[] = {
+ [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
+ [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
+ [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
+ [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
+ [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
+ [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
+ [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
+ [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
+ [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
+ [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
+ [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
+ [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
+};
+
/*
* Returns a pointer to a given slot in a query pool.
*/
@@ -133,6 +188,32 @@ static void* slot_address(struct tu_query_pool *pool, uint32_t query)
return (char*)pool->bo.map + query * pool->stride;
}
+static void
+perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
+ uint32_t index, uint32_t *gid, uint32_t *cid)
+
+{
+ uint32_t i;
+
+ /* TODO. we should handle multipass to be able to get all countables.
+ * Until then apps can only use the first n countables where n == num_counters.
+ *
+ * See tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR.
+ */
+ for (i = 0; i < group_count; i++) {
+ if (group[i].num_counters > index) {
+ *gid = i;
+ *cid = index;
+ break;
+ }
+ index -= group[i].num_counters;
+
+ assert(index >= 0);
+ }
+
+ assert(i < group_count);
+}
+
VkResult
tu_CreateQueryPool(VkDevice _device,
const VkQueryPoolCreateInfo *pCreateInfo,
@@ -143,7 +224,11 @@ tu_CreateQueryPool(VkDevice _device,
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
assert(pCreateInfo->queryCount > 0);
- uint32_t slot_size;
+ uint32_t pool_size, slot_size;
+ const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
+
+ pool_size = sizeof(struct tu_query_pool);
+
switch (pCreateInfo->queryType) {
case VK_QUERY_TYPE_OCCLUSION:
slot_size = sizeof(struct occlusion_query_slot);
@@ -154,6 +239,20 @@ tu_CreateQueryPool(VkDevice _device,
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
slot_size = sizeof(struct primitive_query_slot);
break;
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+ perf_query_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
+ assert(perf_query_info);
+
+ slot_size = sizeof(struct perf_query_slot) +
+ sizeof(struct perfcntr_query_slot) *
+ (perf_query_info->counterIndexCount - 1);
+
+ /* Size of the array pool->counter_indices */
+ pool_size += sizeof(uint32_t) * perf_query_info->counterIndexCount;
+ break;
+ }
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
slot_size = sizeof(struct pipeline_stat_query_slot);
break;
@@ -162,11 +261,21 @@ tu_CreateQueryPool(VkDevice _device,
}
struct tu_query_pool *pool =
- vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),
+ vk_object_alloc(&device->vk, pAllocator, pool_size,
VK_OBJECT_TYPE_QUERY_POOL);
if (!pool)
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ pool->perf_group = fd_perfcntrs(device->physical_device->gpu_id,
+ &pool->perf_group_count);
+
+ pool->counter_index_count = perf_query_info->counterIndexCount;
+
+ for (uint32_t i = 0; i < pool->counter_index_count; i++)
+ pool->counter_indices[i] = perf_query_info->pCounterIndices[i];
+ }
+
VkResult result = tu_bo_init_new(device, &pool->bo,
pCreateInfo->queryCount * slot_size, false);
if (result != VK_SUCCESS) {
@@ -221,6 +330,8 @@ get_result_count(struct tu_query_pool *pool)
return 2;
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
return util_bitcount(pool->pipeline_statistics);
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+ return pool->counter_index_count;
default:
assert(!"Invalid query type");
return 0;
@@ -341,9 +452,11 @@ get_query_pool_results(struct tu_device *device,
if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
uint32_t stat_idx = statistics_index(&statistics);
- result = query_result_addr(pool, query, stat_idx);
+ result = query_result_addr(pool, query, uint64_t, stat_idx);
+ } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
} else {
- result = query_result_addr(pool, query, k);
+ result = query_result_addr(pool, query, uint64_t, k);
}
write_query_value_cpu(result_base, k, *result, flags);
@@ -396,6 +509,7 @@ tu_GetQueryPoolResults(VkDevice _device,
case VK_QUERY_TYPE_TIMESTAMP:
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
return get_query_pool_results(device, pool, firstQuery, queryCount,
dataSize, pData, stride, flags);
default:
@@ -470,9 +584,12 @@ emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
uint32_t stat_idx = statistics_index(&statistics);
- result_iova = query_result_iova(pool, query, stat_idx);
+ result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
+ } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ result_iova = query_result_iova(pool, query,
+ struct perfcntr_query_slot, k);
} else {
- result_iova = query_result_iova(pool, query, k);
+ result_iova = query_result_iova(pool, query, uint64_t, k);
}
if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
@@ -535,6 +652,8 @@ tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
queryCount, buffer, dstOffset, stride, flags);
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+ unreachable("allowCommandBufferQueryCopies is false");
default:
assert(!"Invalid query type");
}
@@ -561,9 +680,12 @@ emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
uint32_t stat_idx = statistics_index(&statistics);
- result_iova = query_result_iova(pool, query, stat_idx);
+ result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
+ } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ result_iova = query_result_iova(pool, query,
+ struct perfcntr_query_slot, k);
} else {
- result_iova = query_result_iova(pool, query, k);
+ result_iova = query_result_iova(pool, query, uint64_t, k);
}
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
@@ -588,6 +710,7 @@ tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
case VK_QUERY_TYPE_OCCLUSION:
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
break;
default:
@@ -608,7 +731,15 @@ tu_ResetQueryPool(VkDevice device,
slot->available = 0;
for (uint32_t k = 0; k < get_result_count(pool); k++) {
- uint64_t *res = query_result_addr(pool, i + firstQuery, k);
+ uint64_t *res;
+
+ if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ res = query_result_addr(pool, i + firstQuery,
+ struct perfcntr_query_slot, k);
+ } else {
+ res = query_result_addr(pool, i + firstQuery, uint64_t, k);
+ }
+
*res = 0;
}
}
@@ -667,6 +798,46 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_qw(cs, begin_iova);
}
+static void
+emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
+ struct tu_query_pool *pool,
+ uint32_t query)
+{
+ struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
+ uint32_t gid = 0, cid = 0;
+
+ tu_cs_emit_wfi(cs);
+
+ for (uint32_t i = 0; i < pool->counter_index_count; i++) {
+ perfcntr_index(pool->perf_group, pool->perf_group_count,
+ pool->counter_indices[i], &gid, &cid);
+
+ const struct fd_perfcntr_counter *counter =
+ &pool->perf_group[gid].counters[cid];
+ const struct fd_perfcntr_countable *countable =
+ &pool->perf_group[gid].countables[cid];
+
+ tu_cs_emit_pkt4(cs, counter->select_reg, 1);
+ tu_cs_emit(cs, countable->selector);
+ }
+
+ tu_cs_emit_wfi(cs);
+
+ for (uint32_t i = 0; i < pool->counter_index_count; i++) {
+ perfcntr_index(pool->perf_group, pool->perf_group_count,
+ pool->counter_indices[i], &gid, &cid);
+
+ const struct fd_perfcntr_counter *counter =
+ &pool->perf_group[gid].counters[cid];
+ uint64_t begin_iova = perf_query_iova(pool, query, begin, i);
+
+ tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
+ tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
+ CP_REG_TO_MEM_0_64B);
+ tu_cs_emit_qw(cs, begin_iova);
+ }
+}
+
static void
emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
struct tu_query_pool *pool,
@@ -701,6 +872,9 @@ tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
emit_begin_xfb_query(cmdbuf, pool, query, 0);
break;
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+ emit_begin_perf_query(cmdbuf, pool, query);
+ break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
emit_begin_stat_query(cmdbuf, pool, query);
break;
@@ -756,7 +930,7 @@ emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
uint64_t available_iova = query_available_iova(pool, query);
uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
uint64_t end_iova = occlusion_query_iova(pool, query, end);
- uint64_t result_iova = query_result_iova(pool, query, 0);
+ uint64_t result_iova = query_result_iova(pool, query, uint64_t, 0);
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
tu_cs_emit_qw(cs, end_iova);
tu_cs_emit_qw(cs, 0xffffffffffffffffull);
@@ -829,7 +1003,7 @@ emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_qw(cs, end_iova);
for (int i = 0; i < STAT_COUNT; i++) {
- result_iova = query_result_iova(pool, query, i);
+ result_iova = query_result_iova(pool, query, uint64_t, i);
stat_start_iova = pipeline_stat_query_iova(pool, query, begin[i]);
stat_stop_iova = pipeline_stat_query_iova(pool, query, end[i]);
@@ -855,6 +1029,67 @@ emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_qw(cs, 0x1);
}
+static void
+emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
+ struct tu_query_pool *pool,
+ uint32_t query)
+{
+ struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
+ uint64_t begin_iova;
+ uint64_t end_iova;
+ uint64_t available_iova = query_available_iova(pool, query);
+ uint64_t result_iova;
+ uint32_t gid = 0, cid = 0;
+
+ tu_cs_emit_wfi(cs);
+
+ for (uint32_t i = 0; i < pool->counter_index_count; i++) {
+ perfcntr_index(pool->perf_group, pool->perf_group_count,
+ pool->counter_indices[i], &gid, &cid);
+
+ const struct fd_perfcntr_counter *counter =
+ &pool->perf_group[gid].counters[cid];
+ end_iova = perf_query_iova(pool, query, end, i);
+
+ tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
+ tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
+ CP_REG_TO_MEM_0_64B);
+ tu_cs_emit_qw(cs, end_iova);
+ }
+
+ tu_cs_emit_wfi(cs);
+
+ for (uint32_t i = 0; i < pool->counter_index_count; i++) {
+ perfcntr_index(pool->perf_group, pool->perf_group_count,
+ pool->counter_indices[i], &gid, &cid);
+
+ result_iova = query_result_iova(pool, query,
+ struct perfcntr_query_slot, i);
+ begin_iova = perf_query_iova(pool, query, begin, i);
+ end_iova = perf_query_iova(pool, query, end, i);
+
+ tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
+ tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
+ CP_MEM_TO_MEM_0_DOUBLE |
+ CP_MEM_TO_MEM_0_NEG_C);
+
+ tu_cs_emit_qw(cs, result_iova);
+ tu_cs_emit_qw(cs, result_iova);
+ tu_cs_emit_qw(cs, end_iova);
+ tu_cs_emit_qw(cs, begin_iova);
+ }
+
+ tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
+
+ if (cmdbuf->state.pass)
+ cs = &cmdbuf->draw_epilogue_cs;
+
+ /* Set the availability to 1 */
+ tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
+ tu_cs_emit_qw(cs, available_iova);
+ tu_cs_emit_qw(cs, 0x1);
+}
+
static void
emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
struct tu_query_pool *pool,
@@ -864,8 +1099,8 @@ emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
- uint64_t result_written_iova = query_result_iova(pool, query, 0);
- uint64_t result_generated_iova = query_result_iova(pool, query, 1);
+ uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0);
+ uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1);
uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
@@ -958,6 +1193,9 @@ tu_CmdEndQuery(VkCommandBuffer commandBuffer,
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
emit_end_xfb_query(cmdbuf, pool, query, 0);
break;
+ case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+ emit_end_perf_query(cmdbuf, pool, query);
+ break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
emit_end_stat_query(cmdbuf, pool, query);
break;
@@ -1029,7 +1267,7 @@ tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
CP_REG_TO_MEM_0_CNT(2) |
CP_REG_TO_MEM_0_64B);
- tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
+ tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));
/* Only flag availability once the entire renderpass is done, similar to
* the begin/end path.
@@ -1067,3 +1305,82 @@ tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
*/
handle_multiview_queries(cmd, pool, query);
}
+
+VkResult
+tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
+ VkPhysicalDevice physicalDevice,
+ uint32_t queueFamilyIndex,
+ uint32_t* pCounterCount,
+ VkPerformanceCounterKHR* pCounters,
+ VkPerformanceCounterDescriptionKHR* pCounterDescriptions)
+{
+ TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
+
+ uint32_t desc_count = *pCounterCount;
+ uint32_t group_count;
+ const struct fd_perfcntr_group *group =
+ fd_perfcntrs(phydev->gpu_id, &group_count);
+
+ VK_OUTARRAY_MAKE(out, pCounters, pCounterCount);
+ VK_OUTARRAY_MAKE(out_desc, pCounterDescriptions, &desc_count);
+
+ /* TODO. we should handle multipass to be able to get all countables.
+ * Until then apps can only use the first n countables where n == num_counters.
+ */
+ for (int i = 0; i < group_count; i++) {
+ for (int j = 0; j < group[i].num_counters; j++) {
+
+ vk_outarray_append(&out, counter) {
+ counter->scope = VK_QUERY_SCOPE_COMMAND_BUFFER_KHR;
+ counter->unit =
+ fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
+ counter->storage =
+ fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];
+
+ unsigned char sha1_result[20];
+ _mesa_sha1_compute(group[i].countables[j].name,
+ strlen(group[i].countables[j].name),
+ sha1_result);
+ memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
+ }
+
+ vk_outarray_append(&out_desc, desc) {
+ desc->flags = 0;
+
+ snprintf(desc->name, sizeof(desc->name),
+ "%s", group[i].countables[j].name);
+ snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
+ snprintf(desc->description, sizeof(desc->description),
+ "%s: %s performance counter",
+ group[i].name, group[i].countables[j].name);
+ }
+ }
+ }
+
+ return vk_outarray_status(&out);
+}
+
+void
+tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
+ VkPhysicalDevice physicalDevice,
+ const VkQueryPoolPerformanceCreateInfoKHR* pPerformanceQueryCreateInfo,
+ uint32_t* pNumPasses)
+{
+ /* TODO. Should support handling multipass. */
+ *pNumPasses = 1;
+}
+
+VkResult
+tu_AcquireProfilingLockKHR(VkDevice device,
+ const VkAcquireProfilingLockInfoKHR* pInfo)
+{
+ /* TODO. Probably there's something to do for kgsl. */
+ return VK_SUCCESS;
+}
+
+void
+tu_ReleaseProfilingLockKHR(VkDevice device)
+{
+ /* TODO. Probably there's something to do for kgsl. */
+ return;
+}
More information about the mesa-commit
mailing list