[Beignet] [PATCH 2/5] Add new CL_Intel_performance_query extension support
Zhenyu Wang
zhenyuw at linux.intel.com
Tue May 12 00:32:59 PDT 2015
This adds a new extension to read Intel hardware performance counters
through the i915 perf event interface.
This extension is mostly based on the GL_INTEL_performance_query
extension, which provides a generic method to access performance
metrics on Intel GPUs.
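For illustration, a rough host-side sketch of how an application might
drive the new entry points (not part of the patch; it links directly
against the declarations in CL/cl_intel.h, omits error handling and the
measured kernel enqueue, and assumes a flags value of 0 for a plain
blocking read):

#include <stdlib.h>
#include <CL/cl_intel.h>

static void measure_with_perf_query(cl_context ctx, cl_command_queue queue)
{
    cl_uint query_id, data_size, n_counters, n_instances, bytes = 0;
    cl_char name[64];
    cl_perf_query_intel query;
    void *data;

    /* Pick the first advertised query set and ask for its data layout. */
    clGetFirstPerfQueryIdIntel(ctx, &query_id);
    clGetPerfQueryInfoIntel(ctx, query_id, sizeof(name), name,
                            &data_size, &n_counters, &n_instances);

    clCreatePerfQueryIntel(ctx, query_id, &query);
    clBeginPerfQueryIntel(ctx, query);
    /* ... enqueue the kernels to be measured and clFinish(queue) ... */
    clEndPerfQueryIntel(ctx, query);

    /* Read back the accumulated counter values; each counter's offset and
     * data type inside 'data' comes from clGetPerfCounterInfoIntel(). */
    data = malloc(data_size);
    clGetPerfQueryDataIntel(ctx, query, 0, data_size, data, &bytes);

    free(data);
    clDeletePerfQueryIntel(ctx, query);
}
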
Signed-off-by: Zhenyu Wang <zhenyuw at linux.intel.com>
---
include/CL/cl_intel.h | 94 +++
src/CMakeLists.txt | 1 +
src/cl_api.c | 238 ++++++
src/cl_context.c | 6 +
src/cl_context.h | 83 +++
src/intel/intel_defines.h | 1 +
src/intel/intel_perf.c | 1814 +++++++++++++++++++++++++++++++++++++++++++++
src/intel_perf.h | 32 +
8 files changed, 2269 insertions(+)
create mode 100644 src/intel/intel_perf.c
create mode 100644 src/intel_perf.h
diff --git a/include/CL/cl_intel.h b/include/CL/cl_intel.h
index 28bcb62..f734eb2 100644
--- a/include/CL/cl_intel.h
+++ b/include/CL/cl_intel.h
@@ -133,6 +133,100 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetMemObjectFdIntel_fn)(
cl_mem /* Memory Obejct */,
int* /* returned fd */);
+/* Intel performance query */
+
+#define PERFQUERY_COUNTER_EVENT_INTEL 0x94F0
+#define PERFQUERY_COUNTER_DURATION_NORM_INTEL 0x94F1
+#define PERFQUERY_COUNTER_DURATION_RAW_INTEL 0x94F2
+#define PERFQUERY_COUNTER_THROUGHPUT_INTEL 0x94F3
+#define PERFQUERY_COUNTER_RAW_INTEL 0x94F4
+#define PERFQUERY_COUNTER_TIMESTAMP_INTEL 0x94F5
+
+#define PERFQUERY_COUNTER_DATA_UINT32_INTEL 0x94F8
+#define PERFQUERY_COUNTER_DATA_UINT64_INTEL 0x94F9
+#define PERFQUERY_COUNTER_DATA_FLOAT_INTEL 0x94FA
+#define PERFQUERY_COUNTER_DATA_DOUBLE_INTEL 0x94FB
+#define PERFQUERY_COUNTER_DATA_BOOL32_INTEL 0x94FC
+
+typedef struct perf_query_object *cl_perf_query_intel;
+
+extern CL_API_ENTRY void CL_API_CALL
+clGetFirstPerfQueryIdIntel(cl_context ctx, cl_uint *queryId);
+typedef CL_API_ENTRY void (CL_API_CALL *clGetFirstPerfQueryIdIntel_fn)(cl_context ctx,
+ cl_uint *queryId);
+
+extern CL_API_ENTRY void CL_API_CALL
+clGetNextPerfQueryIdIntel(cl_context ctx, cl_uint queryId, cl_uint *nextQueryId);
+typedef CL_API_ENTRY void (CL_API_CALL *clGetNextPerfQueryIdIntel_fn)(cl_context ctx,
+ cl_uint queryId,
+ cl_uint *nextQueryId);
+
+extern CL_API_ENTRY void CL_API_CALL
+clGetPerfQueryInfoIntel(cl_context ctx,
+ cl_uint queryId,
+ cl_uint queryNameLength, cl_char *queryName,
+ cl_uint *dataSize, cl_uint *noCounters,
+ cl_uint *noInstances);
+typedef CL_API_ENTRY void (CL_API_CALL *clGetPerfQueryInfoIntel_fn)(cl_context ctx,
+ cl_uint queryId,
+ cl_uint queryNameLength,
+ cl_char *queryName,
+ cl_uint *dataSize,
+ cl_uint *noCounters,
+ cl_uint *noInstances);
+
+extern CL_API_ENTRY void CL_API_CALL
+clGetPerfCounterInfoIntel(cl_context ctx,
+ cl_uint queryId, cl_uint counterId,
+ cl_uint counterNameLength, cl_char *counterName,
+ cl_uint counterDescLength, cl_char *counterDesc,
+ cl_uint *counterOffset, cl_uint *counterDataSize,
+ cl_uint *counterTypeEnum, cl_uint *counterDataTypeEnum,
+ cl_ulong *rawCounterMaxValue);
+typedef CL_API_ENTRY void
+(CL_API_CALL *clGetPerfCounterInfoIntel_fn)(cl_context ctx,
+ cl_uint queryId,
+ cl_uint counterId,
+ cl_uint counterNameLength, cl_char *counterName,
+ cl_uint counterDescLength, cl_char *counterDesc,
+ cl_uint *counterOffset, cl_uint *counterDataSize,
+ cl_uint *counterTypeEnum, cl_uint *counterDataTypeEnum,
+ cl_ulong *rawCounterMaxValue);
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreatePerfQueryIntel(cl_context context, cl_uint queryId, cl_perf_query_intel *queryHandle);
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL *clCreatePerfQueryIntel_fn)(cl_context context,
+ cl_uint queryId,
+ cl_perf_query_intel *queryHandle);
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clDeletePerfQueryIntel(cl_context context, cl_perf_query_intel queryHandle);
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL *clDeletePerfQueryIntel_fn)(cl_context context,
+ cl_perf_query_intel queryHandle);
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clBeginPerfQueryIntel(cl_context context, cl_perf_query_intel queryHandle);
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clBeginPerfQueryIntel_fn)(cl_context context,
+ cl_perf_query_intel queryHandle);
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEndPerfQueryIntel(cl_context context, cl_perf_query_intel queryHandle);
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEndPerfQueryIntel_fn)(cl_context context,
+ cl_perf_query_intel queryHandle);
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPerfQueryDataIntel(cl_context context,
+ cl_perf_query_intel queryHandle,
+ cl_uint flags, size_t dataSize, void *data,
+ cl_uint *bytesWritten);
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL *clGetPerfQueryDataIntel_fn)(cl_context context,
+ cl_perf_query_intel queryHandle,
+ cl_uint flags, size_t dataSize, void *data,
+ cl_uint *bytesWritten);
+
#ifdef __cplusplus
}
#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4e67c71..555a988 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -88,6 +88,7 @@ set(OPENCL_SRC
intel/intel_gpgpu.c
intel/intel_batchbuffer.c
intel/intel_driver.c
+ intel/intel_perf.c
performance.c)
if (X11_FOUND)
diff --git a/src/cl_api.c b/src/cl_api.c
index 3e72deb..5d9de28 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -42,6 +42,8 @@
#include "performance.h"
+#include "intel_perf.h"
+
#ifndef CL_VERSION_1_2
#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)
#define CL_DEVICE_TYPE_CUSTOM (1 << 4)
@@ -3180,6 +3182,15 @@ internal_clGetExtensionFunctionAddress(const char *func_name)
EXTFUNC(clCreateBufferFromLibvaIntel)
EXTFUNC(clCreateImageFromLibvaIntel)
EXTFUNC(clGetMemObjectFdIntel)
+ EXTFUNC(clGetFirstPerfQueryIdIntel)
+ EXTFUNC(clGetNextPerfQueryIdIntel)
+ EXTFUNC(clGetPerfQueryInfoIntel)
+ EXTFUNC(clGetPerfCounterInfoIntel)
+ EXTFUNC(clCreatePerfQueryIntel)
+ EXTFUNC(clDeletePerfQueryIntel)
+ EXTFUNC(clBeginPerfQueryIntel)
+ EXTFUNC(clEndPerfQueryIntel)
+ EXTFUNC(clGetPerfQueryDataIntel)
return NULL;
}
@@ -3348,3 +3359,230 @@ clGetMemObjectFdIntel(cl_context context,
error:
return err;
}
+
+/* Intel performance query extension */
+static bool
+_check_query_id_valid(cl_context ctx, cl_uint id)
+{
+ return (id >= ctx->perfquery.n_queries) ? false : true;
+}
+
+static bool
+_check_counter_id_valid(cl_context ctx, cl_uint query_id, cl_uint counter_id)
+{
+ if (!_check_query_id_valid(ctx, query_id))
+ return false;
+ return (counter_id >= ctx->perfquery.queries[query_id].n_counters) ? false : true;
+}
+
+
+CL_API_ENTRY void CL_API_CALL
+clGetFirstPerfQueryIdIntel(cl_context ctx, cl_uint *queryId)
+{
+ if (!ctx->perfquery.enable)
+ return;
+
+ intel_perf_query_first(ctx, queryId);
+}
+
+CL_API_ENTRY void CL_API_CALL
+clGetNextPerfQueryIdIntel(cl_context ctx, cl_uint queryId, cl_uint *nextQueryId)
+{
+ if (!ctx->perfquery.enable)
+ return;
+
+ if (!_check_query_id_valid(ctx, queryId))
+ return;
+
+ intel_perf_query_next(ctx, queryId, nextQueryId);
+}
+
+static void
+return_string(cl_char *stringRet, cl_uint stringMaxLen, const cl_char *string)
+{
+ if (!stringRet)
+ return;
+
+ strncpy(stringRet, string ? string : "", stringMaxLen);
+
+ if (stringMaxLen > 0)
+ stringRet[stringMaxLen - 1] = '\0';
+}
+
+CL_API_ENTRY void CL_API_CALL
+clGetPerfQueryInfoIntel(cl_context ctx,
+ cl_uint queryId,
+ cl_uint queryNameLength, cl_char *queryName,
+ cl_uint *dataSize, cl_uint *noCounters,
+ cl_uint *noInstances)
+{
+ cl_char *name;
+ cl_uint data_size;
+ cl_uint no_counter;
+ cl_uint no_instance;
+
+ if (!ctx->perfquery.enable)
+ return;
+
+ if (!_check_query_id_valid(ctx, queryId))
+ return;
+
+ intel_perf_query_info(ctx, queryId, &name,
+ &data_size, &no_counter, &no_instance);
+ return_string(queryName, queryNameLength, name);
+
+ if (dataSize)
+ *dataSize = data_size;
+
+ if (noCounters)
+ *noCounters = no_counter;
+
+ if (noInstances)
+ *noInstances = no_instance;
+}
+
+CL_API_ENTRY void CL_API_CALL
+clGetPerfCounterInfoIntel(cl_context ctx,
+ cl_uint queryId, cl_uint counterId,
+ cl_uint counterNameLength, cl_char *counterName,
+ cl_uint counterDescLength, cl_char *counterDesc,
+ cl_uint *counterOffset, cl_uint *counterDataSize,
+ cl_uint *counterTypeEnum, cl_uint *counterDataTypeEnum,
+ cl_ulong *rawCounterMaxValue)
+{
+ cl_char *name;
+ cl_char *desc;
+ cl_uint offset;
+ cl_uint data_size;
+ cl_uint counter_type;
+ cl_uint data_type;
+ cl_ulong raw_max;
+
+ if (!ctx->perfquery.enable)
+ return;
+
+ if (!_check_counter_id_valid(ctx, queryId, counterId))
+ return;
+
+ intel_perf_counter_info(ctx, queryId, counterId,
+ &name, &desc, &offset, &data_size,
+ &counter_type, &data_type,
+ &raw_max);
+
+ return_string(counterName, counterNameLength, name);
+ return_string(counterDesc, counterDescLength, desc);
+ if (counterOffset)
+ *counterOffset = offset;
+ if (counterDataSize)
+ *counterDataSize = data_size;
+ if (counterTypeEnum)
+ *counterTypeEnum = counter_type;
+ if (counterDataTypeEnum)
+ *counterDataTypeEnum = data_type;
+ if (rawCounterMaxValue)
+ *rawCounterMaxValue = raw_max;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clCreatePerfQueryIntel(cl_context context, cl_uint queryId, cl_perf_query_intel *queryHandle)
+{
+ cl_int err = CL_SUCCESS;
+ cl_perf_query_intel handle;
+ CHECK_CONTEXT (context);
+
+ if (!context->perfquery.enable) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!_check_query_id_valid(context, queryId)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ /* currently we only allow 1 query instance */
+ if (context->perfquery.n_query_instances) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ err = intel_perf_query_create(context, queryId, &handle);
+
+ if (!err && queryHandle)
+ *queryHandle = handle;
+
+error:
+ return err;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clDeletePerfQueryIntel(cl_context context, cl_perf_query_intel queryHandle)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+
+ if (!context->perfquery.enable) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ err = intel_perf_query_delete(context, queryHandle);
+
+error:
+ return err;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clBeginPerfQueryIntel(cl_context context, cl_perf_query_intel queryHandle)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT(context);
+
+ if (!context->perfquery.enable) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ err = intel_perf_query_begin(context, queryHandle);
+
+error:
+ return err;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEndPerfQueryIntel(cl_context context, cl_perf_query_intel queryHandle)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT(context);
+
+ if (!context->perfquery.enable) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ err = intel_perf_query_end(context, queryHandle);
+
+error:
+ return err;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetPerfQueryDataIntel(cl_context context,
+ cl_perf_query_intel queryHandle,
+ cl_uint flags, size_t dataSize, void *data,
+ cl_uint *bytesWritten)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT(context);
+
+ if (!context->perfquery.enable) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ err = intel_perf_query_get_data(context, queryHandle, flags,
+ dataSize, data, bytesWritten);
+
+error:
+ return err;
+}
diff --git a/src/cl_context.c b/src/cl_context.c
index 0f08e6a..148c7c6 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -32,6 +32,8 @@
#include "CL/cl.h"
#include "CL/cl_gl.h"
+#include "intel_perf.h"
+
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
@@ -150,6 +152,8 @@ cl_create_context(const cl_context_properties * properties,
ctx->pfn_notify = pfn_notify;
ctx->user_data = user_data;
+ intel_perf_query_init(ctx);
+
exit:
if (errcode_ret != NULL)
*errcode_ret = err;
@@ -196,6 +200,8 @@ cl_context_delete(cl_context ctx)
if (atomic_dec(&ctx->ref_n) > 1)
return;
+ intel_perf_query_destroy(ctx);
+
/* delete the internal programs. */
for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
if (ctx->internel_kernels[i]) {
diff --git a/src/cl_context.h b/src/cl_context.h
index 249fed8..fba81e1 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -26,6 +26,7 @@
#include "cl_khr_icd.h"
#include <stdint.h>
+#include <stdbool.h>
#include <pthread.h>
/* DRI device created at create context */
@@ -93,6 +94,28 @@ struct _cl_context_prop {
};
};
+struct perf_query_counter;
+struct perf_oa_counter;
+
+struct perf_query
+{
+ const char *name;
+ struct perf_query_counter *counters;
+ int n_counters;
+ size_t data_size;
+
+ /* OA specific */
+ int oa_metrics_set;
+ int oa_format;
+ struct perf_oa_counter *oa_counters;
+ int n_oa_counters;
+};
+
+#define MAX_PERF_QUERIES 2
+#define MAX_PERF_QUERY_COUNTERS 150
+#define MAX_OA_QUERY_COUNTERS 100
+#define MAX_RAW_OA_COUNTERS 62
+
#define IS_EGL_CONTEXT(ctx) (ctx->props.gl_type == CL_GL_EGL_DISPLAY)
#define EGL_DISP(ctx) (EGLDisplay)(ctx->props.egl_display)
#define EGL_CTX(ctx) (EGLContext)(ctx->props.gl_context)
@@ -127,6 +150,66 @@ struct _cl_context {
/* User's callback when error occur in context */
void *user_data; /* A pointer to user supplied data */
+ struct {
+ struct perf_query queries[MAX_PERF_QUERIES];
+ int n_queries;
+ bool enable;
+
+ /* A common OA counter that we want to read directly in several places */
+ uint64_t (*read_oa_report_timestamp)(uint32_t *report);
+
+ /* Needed to normalize counters aggregated across all EUs */
+ int eu_count;
+
+ /* The i915_oa perf event we open to setup + enable the OA counters */
+ int perf_oa_event_fd;
+
+ /* An i915_oa perf event fd gives exclusive access to the OA unit that
+ * will report counter snapshots for a specific counter set/profile in a
+ * specific layout/format so we can only start OA queries that are
+ * compatible with the currently open fd... */
+ int perf_oa_metrics_set;
+ int perf_oa_format;
+
+ /* The mmaped circular buffer for collecting samples from perf */
+ uint8_t *perf_oa_mmap_base;
+ size_t perf_oa_buffer_size;
+ struct perf_event_mmap_page *perf_oa_mmap_page;
+
+ /* The system's page size */
+ unsigned int page_size;
+
+ /* TODO: generalize and split these into an array indexed by the
+ * query type... */
+ int n_active_oa_queries;
+
+ /* The number of queries depending on running OA counters which
+ * extends beyond brw_end_perf_query() since we need to wait until
+ * the last MI_RPC command has been written. */
+ int n_oa_users;
+
+ /* We also get the gpu to write an ID for snapshots corresponding
+ * to the beginning and end of a query, but for simplicity these
+ * IDs use a separate namespace. */
+ int next_query_start_report_id;
+
+ /**
+ * An array of queries whose results haven't yet been assembled based on
+ * the data in buffer objects.
+ *
+ * These may be active, or have already ended. However, the results
+ * have not been requested.
+ */
+ struct perf_query_object **unresolved;
+ int unresolved_elements;
+ int unresolved_array_size;
+
+ /* The total number of query objects so we can relinquish
+ * our exclusive access to perf if the application deletes
+ * all of its objects. (NB: We only disable perf while
+ * there are no active queries) */
+ int n_query_instances;
+ } perfquery;
};
/* Implement OpenCL function */
diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h
index 1080a91..50b835d 100644
--- a/src/intel/intel_defines.h
+++ b/src/intel/intel_defines.h
@@ -321,6 +321,7 @@
#define GEN7_PIPE_CONTROL_WRITE_TIMESTAMP (3 << 14)
#define GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE (1 << 2)
+#define MI_REPORT_PERF_COUNT ((0x28 << 23) | (3 - 2))
#define GEN_MAPFILTER_NEAREST 0x0
#define GEN_MAPFILTER_LINEAR 0x1
diff --git a/src/intel/intel_perf.c b/src/intel/intel_perf.c
new file mode 100644
index 0000000..65ec2a1
--- /dev/null
+++ b/src/intel/intel_perf.c
@@ -0,0 +1,1814 @@
+/*
+ * Copyright 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Author:
+ * Zhenyu Wang <zhenyuw at linux.intel.com>
+ */
+
+#include <linux/perf_event.h>
+
+#include <asm/unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <stropts.h>
+
+#include <limits.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <assert.h>
+#include <inttypes.h>
+
+#include "intel_driver.h"
+#include "intel_perf.h"
+
+#include "intel/intel_gpgpu.h"
+#include "intel/intel_defines.h"
+#include "intel/intel_batchbuffer.h"
+
+#include "cl_context.h"
+#include "cl_command_queue.h"
+#include "cl_device_id.h"
+
+#include "i915_drm.h"
+
+#define DBG(fmt, args...) fprintf(stderr, fmt, ##args)
+
+/* Describes how to read one OA counter which might be a raw counter read
+ * directly from a counter snapshot or could be a higher level counter derived
+ * from one or more raw counters.
+ *
+ * Raw counters will have ->report_offset set to the snapshot offset and an
+ * accumulator that accounts for counter overflow according to the width of
+ * that counter.
+ *
+ * Higher level counters can currently reference up to 3 other counters + use
+ * ->config for anything. They don't need an accumulator.
+ *
+ * The data type that will be written to *value_out by the read function can
+ * be determined by ->data_type
+ */
+struct perf_oa_counter
+{
+ struct perf_oa_counter *reference0;
+ struct perf_oa_counter *reference1;
+ struct perf_oa_counter *reference2;
+ union {
+ int report_offset;
+ int config;
+ };
+
+ int accumulator_index;
+ void (*accumulate)(struct perf_oa_counter *counter,
+ uint32_t *start,
+ uint32_t *end,
+ uint64_t *accumulator);
+ unsigned int data_type;
+ void (*read)(struct perf_oa_counter *counter,
+ uint64_t *accumulated,
+ void *value_out);
+};
+
+/* A counter that will be advertised and reported to applications */
+struct perf_query_counter
+{
+ const char *name;
+ const char *desc;
+ unsigned int type;
+ unsigned int data_type;
+ uint64_t raw_max;
+ size_t offset;
+ size_t size;
+
+ union {
+ struct perf_oa_counter *oa_counter;
+ uint32_t pipeline_stat_reg;
+ };
+};
+
+struct perf_query_builder
+{
+ cl_context ctx;
+ struct perf_query *query;
+ size_t offset;
+ int next_accumulator_index;
+
+ int a_offset;
+ int b_offset;
+ int c_offset;
+
+ struct perf_oa_counter *gpu_core_clock;
+};
+
+/**
+ * i965 representation of a performance query object.
+ *
+ * NB: We want to keep this structure relatively lean considering that
+ * applications may expect to allocate enough objects to be able to
+ * query around all draw calls in a frame.
+ */
+struct perf_query_object
+{
+ const struct perf_query *query;
+
+ /* Use own batch for perf bo */
+ intel_batchbuffer_t *batch;
+
+ struct {
+
+ /**
+ * BO containing OA counter snapshots at query Begin/End time.
+ */
+ drm_intel_bo *bo;
+ int current_report_id;
+
+ /**
+ * We collect periodic counter snapshots via perf so we can account
+ * for counter overflow and this is a pointer into the circular
+ * perf buffer for collecting snapshots that lie within the begin-end
+ * bounds of this query.
+ */
+ unsigned int perf_tail;
+
+ /**
+ * Storage for the final accumulated OA counters.
+ */
+ uint64_t accumulator[MAX_RAW_OA_COUNTERS];
+
+ /**
+ * false while in the unresolved_elements list, and set to true when
+ * the final, end MI_RPC snapshot has been accumulated.
+ */
+ bool results_accumulated;
+
+ } oa;
+};
+
+/* Samples read from the perf circular buffer */
+struct oa_perf_sample {
+ struct perf_event_header header;
+ uint32_t raw_size;
+ uint8_t raw_data[];
+};
+#define MAX_OA_PERF_SAMPLE_SIZE (8 + /* perf_event_header */ \
+ 4 + /* raw_size */ \
+ 256 + /* raw OA counter snapshot */ \
+ 4) /* alignment padding */
+
+#define TAKEN(HEAD, TAIL, POT_SIZE) (((HEAD) - (TAIL)) & (POT_SIZE - 1))
+
+/* Note: this will equate to 0 when the buffer is exactly full... */
+#define REMAINING(HEAD, TAIL, POT_SIZE) (POT_SIZE - TAKEN (HEAD, TAIL, POT_SIZE))
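+
+/* Illustrative example (not from the original patch): with a 4096 byte
+ * power-of-two buffer, tail = 100 and head = 612 give
+ * TAKEN = (612 - 100) & 4095 = 512 bytes of pending sample data and
+ * REMAINING = 4096 - 512 = 3584 bytes of free space; the mask only works
+ * because POT_SIZE is a power of two. */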
+
+#if defined(__i386__)
+#define rmb() __asm__ volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define mb() __asm__ volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#endif
+
+#if defined(__x86_64__)
+#define rmb() __asm__ volatile("lfence" ::: "memory")
+#define mb() __asm__ volatile("mfence" ::: "memory")
+#endif
+
+/* TODO: consider using <stdatomic.h> something like:
+ *
+ * #define rmb() atomic_thread_fence(memory_order_acquire)
+ * #define mb() atomic_thread_fence(memory_order_seq_cst)
+ */
+
+/* Allow building for a more recent kernel than the system headers
+ * correspond to... */
+#ifndef PERF_EVENT_IOC_FLUSH
+#include <linux/ioctl.h>
+#define PERF_EVENT_IOC_FLUSH _IO ('$', 8)
+#endif
+
+#define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048
+
+static inline size_t
+pot_align(size_t base, int pot_alignment)
+{
+ return (base + pot_alignment - 1) & ~(pot_alignment - 1);
+}
+
+/******************************************************************************/
+/**
+ * Emit an MI_REPORT_PERF_COUNT command packet.
+ *
+ * This writes the current OA counter values to buffer.
+ */
+static void
+emit_mi_report_perf_count(cl_context ctx,
+ struct perf_query_object *obj,
+ uint32_t offset_in_bytes,
+ uint32_t report_id)
+{
+ drm_intel_bo *bo = obj->oa.bo;
+
+ assert(offset_in_bytes % 64 == 0);
+
+ intel_batchbuffer_reset(obj->batch, 512);
+
+ /* Reports apparently don't always get written unless we flush first. */
+ /* XXX required? need to call pipe_control function in intel_gpgpu.c */
+ // intel_batchbuffer_emit_mi_flush(brw);
+
+ BEGIN_BATCH(obj->batch, 3);
+ OUT_BATCH(obj->batch, MI_REPORT_PERF_COUNT);
+ OUT_RELOC(obj->batch, bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+ offset_in_bytes);
+ OUT_BATCH(obj->batch, report_id);
+ ADVANCE_BATCH(obj->batch);
+
+ intel_batchbuffer_flush(obj->batch);
+
+ /* XXX */
+ /* Reports apparently don't always get written unless we flush after. */
+ //intel_batchbuffer_emit_mi_flush(brw);
+}
+
+static unsigned int
+read_perf_head(struct perf_event_mmap_page *mmap_page)
+{
+ unsigned int head = (*(volatile uint64_t *)&mmap_page->data_head);
+ rmb();
+
+ return head;
+}
+
+static void
+write_perf_tail(struct perf_event_mmap_page *mmap_page,
+ unsigned int tail)
+{
+ /* Make sure we've finished reading all the sample data we're
+ * consuming before updating the tail... */
+ mb();
+ mmap_page->data_tail = tail;
+}
+
+/* Update the real perf tail pointer according to the query tail that
+ * is currently furthest behind...
+ */
+static void
+update_perf_tail(cl_context ctx)
+{
+ unsigned int size = ctx->perfquery.perf_oa_buffer_size;
+ unsigned int head = read_perf_head(ctx->perfquery.perf_oa_mmap_page);
+ int straggler_taken = -1;
+ unsigned int straggler_tail;
+ int i;
+
+ for (i = 0; i < ctx->perfquery.unresolved_elements; i++) {
+ struct perf_query_object *obj = ctx->perfquery.unresolved[i];
+ int taken;
+
+ if (!obj->oa.bo)
+ continue;
+
+ taken = TAKEN(head, obj->oa.perf_tail, size);
+
+ if (taken > straggler_taken) {
+ straggler_taken = taken;
+ straggler_tail = obj->oa.perf_tail;
+ }
+ }
+
+ if (straggler_taken >= 0)
+ write_perf_tail(ctx->perfquery.perf_oa_mmap_page, straggler_tail);
+}
+
+/**
+ * Add a query to the global list of "unresolved queries."
+ *
+ * Queries are "unresolved" until all the counter snapshots have been
+ * accumulated via accumulate_oa_snapshots() after the end MI_REPORT_PERF_COUNT
+ * has landed in query->oa.bo.
+ */
+static void
+add_to_unresolved_query_list(cl_context ctx,
+ struct perf_query_object *obj)
+{
+ if (ctx->perfquery.unresolved_elements >=
+ ctx->perfquery.unresolved_array_size) {
+ ctx->perfquery.unresolved_array_size *= 1.5;
+ ctx->perfquery.unresolved = realloc(ctx->perfquery.unresolved,
+ sizeof(struct perf_query_object *) *
+ ctx->perfquery.unresolved_array_size);
+ }
+
+ ctx->perfquery.unresolved[ctx->perfquery.unresolved_elements++] = obj;
+
+ if (obj->oa.bo)
+ update_perf_tail(ctx);
+}
+
+/**
+ * Remove a query from the global list of "unresolved queries" once
+ * the end MI_RPC OA counter snapshot has been accumulated, or when
+ * discarding unwanted query results.
+ */
+static void
+drop_from_unresolved_query_list(cl_context ctx,
+ struct perf_query_object *obj)
+{
+ int i;
+
+ for (i = 0; i < ctx->perfquery.unresolved_elements; i++) {
+ if (ctx->perfquery.unresolved[i] == obj) {
+ int last_elt = --ctx->perfquery.unresolved_elements;
+
+ if (i == last_elt)
+ ctx->perfquery.unresolved[i] = NULL;
+ else
+ ctx->perfquery.unresolved[i] = ctx->perfquery.unresolved[last_elt];
+
+ break;
+ }
+ }
+
+ if (obj->oa.bo)
+ update_perf_tail(ctx);
+}
+
+static uint64_t
+read_report_timestamp(cl_context ctx, uint32_t *report)
+{
+ return ctx->perfquery.read_oa_report_timestamp(report);
+}
+
+/**
+ * Given pointers to starting and ending OA snapshots, add the deltas for each
+ * counter to the results.
+ */
+static void
+add_deltas(cl_context ctx,
+ struct perf_query_object *obj,
+ uint32_t *start, uint32_t *end)
+{
+ const struct perf_query *query = obj->query;
+ int i;
+
+#if 0
+ fprintf(stderr, "Accumulating delta:\n");
+ fprintf(stderr, "> Start timestamp = %" PRIu64 "\n", read_report_timestamp(ctx, start));
+ fprintf(stderr, "> End timestamp = %" PRIu64 "\n", read_report_timestamp(ctx, end));
+#endif
+
+ for (i = 0; i < query->n_oa_counters; i++) {
+ struct perf_oa_counter *oa_counter = &query->oa_counters[i];
+ //uint64_t pre_accumulate;
+
+ if (!oa_counter->accumulate)
+ continue;
+
+ //pre_accumulate = query->oa.accumulator[counter->id];
+ oa_counter->accumulate(oa_counter,
+ start, end,
+ obj->oa.accumulator);
+#if 0
+ fprintf(stderr, "> Updated %s from %" PRIu64 " to %" PRIu64 "\n",
+ counter->name, pre_accumulate,
+ query->oa.accumulator[counter->id]);
+#endif
+ }
+}
+
+/* Handle restarting the ioctl if interrupted... */
+static int
+perf_ioctl(int fd, unsigned long request, void *arg)
+{
+ int ret;
+
+ do {
+ ret = ioctl(fd, request, arg);
+ } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
+ return ret;
+}
+
+static bool
+inc_n_oa_users(cl_context ctx)
+{
+ if (ctx->perfquery.n_oa_users == 0 &&
+ perf_ioctl(ctx->perfquery.perf_oa_event_fd,
+ PERF_EVENT_IOC_ENABLE, 0) < 0)
+ {
+ return false;
+ }
+ ++ctx->perfquery.n_oa_users;
+
+ return true;
+}
+
+static void
+dec_n_oa_users(cl_context ctx)
+{
+ /* Disabling the i915_oa event will effectively disable the OA
+ * counters. Note it's important to be sure there are no outstanding
+ * MI_RPC commands at this point since they could stall the CS
+ * indefinitely once OACONTROL is disabled.
+ */
+ --ctx->perfquery.n_oa_users;
+ if (ctx->perfquery.n_oa_users == 0 &&
+ perf_ioctl(ctx->perfquery.perf_oa_event_fd,
+ PERF_EVENT_IOC_DISABLE, 0) < 0)
+ {
+ DBG("WARNING: Error disabling i915_oa perf event: %m\n");
+ }
+}
+
+/**
+ * Accumulate OA counter results from a series of snapshots.
+ *
+ * N.B. We write snapshots for the beginning and end of a query into
+ * query->oa.bo as well as collect periodic snapshots from the Linux
+ * perf interface.
+ *
+ * These periodic snapshots help to ensure we handle counter overflow
+ * correctly by being frequent enough to ensure we don't miss multiple
+ * wrap overflows of a counter between snapshots.
+ */
+static void
+accumulate_oa_snapshots(cl_context ctx,
+ struct perf_query_object *obj)
+{
+ uint32_t *query_buffer;
+ uint8_t *data = ctx->perfquery.perf_oa_mmap_base + ctx->perfquery.page_size;
+ const unsigned int size = ctx->perfquery.perf_oa_buffer_size;
+ const uint64_t mask = size - 1;
+ uint64_t head;
+ uint64_t tail;
+ uint32_t *start;
+ uint64_t start_timestamp;
+ uint32_t *last;
+ uint32_t *end;
+ uint64_t end_timestamp;
+ uint8_t scratch[MAX_OA_PERF_SAMPLE_SIZE];
+
+ if (perf_ioctl(ctx->perfquery.perf_oa_event_fd,
+ PERF_EVENT_IOC_FLUSH, 0) < 0)
+ DBG("Failed to flush outstanding perf events: %m\n");
+
+ drm_intel_bo_map(obj->oa.bo, false);
+ query_buffer = obj->oa.bo->virtual;
+
+ start = last = query_buffer;
+ end = query_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint32_t));
+
+#warning "TODO: find a way to report OA errors from the kernel"
+ /* XXX: Is there anything we can do to handle this gracefully/
+ * report the error to the application? */
+ if (start[0] != obj->oa.current_report_id)
+ DBG("Spurious start report id=%"PRIu32"\n", start[0]);
+ if (end[0] != (obj->oa.current_report_id + 1))
+ DBG("Spurious end report id=%"PRIu32"\n", start[0]);
+
+ start_timestamp = read_report_timestamp(ctx, start);
+ end_timestamp = read_report_timestamp(ctx, end);
+
+ head = read_perf_head(ctx->perfquery.perf_oa_mmap_page);
+ tail = obj->oa.perf_tail;
+
+ //fprintf(stderr, "Handle event mask = 0x%" PRIx64
+ // " head=%" PRIu64 " tail=%" PRIu64 "\n", mask, head, tail);
+
+ while (TAKEN(head, tail, size)) {
+ const struct perf_event_header *header =
+ (const struct perf_event_header *)(data + (tail & mask));
+
+ if (header->size == 0) {
+ DBG("Spurious header size == 0\n");
+ /* XXX: How should we handle this instead of exiting() */
+#warning "FIXME: avoid exit(1) in error condition"
+ exit(1);
+ }
+
+ if (header->size > (head - tail)) {
+ DBG("Spurious header size would overshoot head\n");
+ /* XXX: How should we handle this instead of exiting() */
+ exit(1);
+ }
+
+ //fprintf(stderr, "header = %p tail=%" PRIu64 " size=%d\n",
+ // header, tail, header->size);
+
+ if ((const uint8_t *)header + header->size > data + size) {
+ int before;
+
+ if (header->size > MAX_OA_PERF_SAMPLE_SIZE) {
+ DBG("Skipping spurious sample larger than expected\n");
+ tail += header->size;
+ continue;
+ }
+
+ before = data + size - (const uint8_t *)header;
+
+ memcpy(scratch, header, before);
+ memcpy(scratch + before, data, header->size - before);
+
+ header = (struct perf_event_header *)scratch;
+ //fprintf(stderr, "DEBUG: split\n");
+ //exit(1);
+ }
+
+ switch (header->type) {
+ case PERF_RECORD_LOST: {
+ struct {
+ struct perf_event_header header;
+ uint64_t id;
+ uint64_t n_lost;
+ } *lost = (void *)header;
+ DBG("i915_oa: Lost %" PRIu64 " events\n", lost->n_lost);
+ break;
+ }
+
+ case PERF_RECORD_THROTTLE:
+ DBG("i915_oa: Sampling has been throttled\n");
+ break;
+
+ case PERF_RECORD_UNTHROTTLE:
+ DBG("i915_oa: Sampling has been unthrottled\n");
+ break;
+
+ case PERF_RECORD_SAMPLE: {
+ struct oa_perf_sample *perf_sample = (struct oa_perf_sample *)header;
+ uint32_t *report = (uint32_t *)perf_sample->raw_data;
+ uint64_t timestamp = read_report_timestamp(ctx, report);
+
+ if (timestamp >= end_timestamp)
+ goto end;
+
+ if (timestamp > start_timestamp) {
+ add_deltas(ctx, obj, last, report);
+ last = report;
+ }
+
+ break;
+ }
+
+ default:
+ DBG("i915_oa: Spurious header type = %d\n", header->type);
+ }
+
+ //fprintf(stderr, "Tail += %d\n", header->size);
+
+ tail += header->size;
+ }
+
+end:
+
+ add_deltas(ctx, obj, last, end);
+
+ DBG("Marking %p resolved - results gathered\n", obj);
+
+ drm_intel_bo_unmap(obj->oa.bo);
+ obj->oa.results_accumulated = true;
+ drop_from_unresolved_query_list(ctx, obj);
+ dec_n_oa_users(ctx);
+}
+
+/******************************************************************************/
+
+static uint64_t
+read_file_uint64 (const char *file)
+{
+ char buf[32];
+ int fd, n;
+
+ fd = open(file, 0);
+ if (fd < 0)
+ return 0;
+ n = read(fd, buf, sizeof (buf) - 1);
+ close(fd);
+ if (n < 0)
+ return 0;
+
+ buf[n] = '\0';
+ return strtoull(buf, 0, 0);
+}
+
+static uint64_t
+lookup_i915_oa_id (void)
+{
+ return read_file_uint64("/sys/bus/event_source/devices/i915_oa/type");
+}
+
+static long
+perf_event_open (struct perf_event_attr *hw_event,
+ pid_t pid,
+ int cpu,
+ int group_fd,
+ unsigned long flags)
+{
+ return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
+}
+
+static bool
+open_i915_oa_event(cl_context ctx,
+ int metrics_set,
+ int report_format,
+ int period_exponent,
+ int drm_fd,
+ uint32_t ctx_id)
+{
+ struct perf_event_attr attr;
+ drm_i915_oa_attr_t oa_attr;
+ int event_fd;
+ void *mmap_base;
+
+ memset(&attr, 0, sizeof(attr));
+ attr.size = sizeof(attr);
+ attr.type = lookup_i915_oa_id();
+
+ attr.sample_type = PERF_SAMPLE_RAW;
+ attr.disabled = 1; /* initially off */
+ attr.sample_period = 1;
+
+ memset(&oa_attr, 0, sizeof(oa_attr));
+ oa_attr.size = sizeof(oa_attr);
+
+ oa_attr.format = report_format;
+ oa_attr.metrics_set = metrics_set;
+ oa_attr.timer_exponent = period_exponent;
+
+ oa_attr.single_context = true;
+ oa_attr.ctx_id = ctx_id;
+ oa_attr.drm_fd = drm_fd;
+
+ attr.config = (uint64_t)&oa_attr;
+
+ event_fd = perf_event_open(&attr,
+ -1, /* pid */
+ 0, /* cpu */
+ -1, /* group fd */
+ PERF_FLAG_FD_CLOEXEC); /* flags */
+ if (event_fd == -1) {
+ DBG("Error opening i915_oa perf event: %m\n");
+ return false;
+ }
+
+ /* NB: A read-write mapping ensures the kernel will stop writing data when
+ * the buffer is full, and will report samples as lost. */
+ mmap_base = mmap(NULL,
+ ctx->perfquery.perf_oa_buffer_size + ctx->perfquery.page_size,
+ PROT_READ | PROT_WRITE, MAP_SHARED, event_fd, 0);
+ if (mmap_base == MAP_FAILED) {
+ DBG("Error mapping circular buffer, %m\n");
+ close (event_fd);
+ return false;
+ }
+
+ ctx->perfquery.perf_oa_event_fd = event_fd;
+ ctx->perfquery.perf_oa_mmap_base = mmap_base;
+ ctx->perfquery.perf_oa_mmap_page = mmap_base;
+
+ ctx->perfquery.perf_oa_metrics_set = metrics_set;
+ ctx->perfquery.perf_oa_format = report_format;
+
+ return true;
+}
+
+static bool
+begin_perf_query(cl_context ctx,
+ struct perf_query_object *obj)
+{
+ const struct perf_query *query = obj->query;
+ intel_driver_t *driver = (intel_driver_t *)ctx->drv;
+
+ /* If the OA counters aren't already on, enable them. */
+ if (ctx->perfquery.perf_oa_event_fd == -1) {
+ uint32_t ctx_id = drm_intel_gem_context_get_context_id(driver->ctx);
+ int period_exponent;
+
+ /* The timestamp for HSW+ increments every 80ns
+ *
+ * The period_exponent gives a sampling period as follows:
+ * sample_period = 80ns * 2^(period_exponent + 1)
+ *
+ * The overflow period for Haswell can be calculated as:
+ *
+ * 2^32 / (n_eus * max_gen_freq * 2)
+ * (E.g. 40 EUs @ 1GHz = ~53ms)
+ *
+ * We currently sample every 42 milliseconds...
+ */
+ period_exponent = 18;
+
+ if (!open_i915_oa_event(ctx,
+ query->oa_metrics_set,
+ query->oa_format,
+ period_exponent,
+ driver->fd,
+ ctx_id))
+ return false;
+ } else {
+ /* Opening an i915_oa event fd implies exclusive access to
+ * the OA unit which will generate counter reports for a
+ * specific counter set/profile with a specific layout/format
+ * so we can't begin any OA based queries that require a
+ * different profile or format unless we get an opportunity
+ * to close the event fd and open a new one...
+ */
+ if (ctx->perfquery.perf_oa_metrics_set != query->oa_metrics_set ||
+ ctx->perfquery.perf_oa_format != query->oa_format)
+ {
+ return false;
+ }
+ }
+
+ if (!inc_n_oa_users(ctx)) {
+ DBG("WARNING: Error enabling i915_oa perf event: %m\n");
+ return false;
+ }
+
+ if (obj->oa.bo) {
+ drm_intel_bo_unreference(obj->oa.bo);
+ obj->oa.bo = NULL;
+ }
+
+ obj->oa.bo =
+ drm_intel_bo_alloc(driver->bufmgr, "perf. query OA bo", 4096, 64);
+#ifdef DEBUG
+ /* Pre-filling the BO helps debug whether writes landed. */
+ drm_intel_bo_map(obj->oa.bo, true);
+ memset((char *) obj->oa.bo->virtual, 0x80, 4096);
+ drm_intel_bo_unmap(obj->oa.bo);
+#endif
+
+ obj->oa.current_report_id = ctx->perfquery.next_query_start_report_id;
+ ctx->perfquery.next_query_start_report_id += 2;
+
+ /* Take a starting OA counter snapshot. */
+ emit_mi_report_perf_count(ctx, obj, 0,
+ obj->oa.current_report_id);
+ ++ctx->perfquery.n_active_oa_queries;
+
+ /* Each unresolved query maintains a separate tail pointer into the
+ * circular perf sample buffer. The real tail pointer in
+ * perfquery.perf_oa_mmap_page.data_tail will correspond to the query
+ * tail that is furthest behind.
+ */
+ obj->oa.perf_tail = read_perf_head(ctx->perfquery.perf_oa_mmap_page);
+
+ memset(obj->oa.accumulator, 0, sizeof(obj->oa.accumulator));
+ obj->oa.results_accumulated = false;
+
+ add_to_unresolved_query_list(ctx, obj);
+
+ return true;
+}
+
+static void
+end_perf_query(cl_context ctx,
+ struct perf_query_object *obj)
+{
+ /* Take an ending OA counter snapshot. */
+ emit_mi_report_perf_count(ctx, obj,
+ SECOND_SNAPSHOT_OFFSET_IN_BYTES,
+ obj->oa.current_report_id + 1);
+ --ctx->perfquery.n_active_oa_queries;
+
+ /* NB: even though the query has now ended, it can't be resolved
+ * until the end MI_REPORT_PERF_COUNT snapshot has been written
+ * to query->oa.bo */
+}
+
+static void
+wait_perf_query(cl_context ctx,
+ struct perf_query_object *obj)
+{
+ drm_intel_bo *bo = NULL;
+
+ bo = obj->oa.bo;
+ if (bo == NULL)
+ return;
+
+ /* If the current batch references our results bo then we need to
+ * flush first... */
+ if (drm_intel_bo_references(obj->batch->buffer, bo))
+ intel_batchbuffer_flush(obj->batch);
+
+#if 0 /* XXX */
+ if (drm_intel_bo_busy(bo))
+ perf_debug("Stalling GPU waiting for a performance query object.\n");
+#endif
+
+ drm_intel_bo_wait_rendering(bo);
+}
+
+/**
+ * Is a performance query result available?
+ */
+static bool
+is_perf_query_ready(cl_context ctx,
+ struct perf_query_object *obj)
+{
+ return (obj->oa.results_accumulated ||
+ (obj->oa.bo &&
+ !drm_intel_bo_references(obj->batch->buffer, obj->oa.bo) &&
+ !drm_intel_bo_busy(obj->oa.bo)));
+}
+
+
+/******************************************************************************/
+
+/* Type safe wrappers for reading OA counter values */
+
+static uint64_t
+read_uint64_oa_counter(struct perf_oa_counter *counter, uint64_t *accumulated)
+{
+ uint64_t value;
+
+ assert(counter->data_type == PERFQUERY_COUNTER_DATA_UINT64_INTEL);
+
+ counter->read(counter, accumulated, &value);
+
+ return value;
+}
+
+static float
+read_float_oa_counter(struct perf_oa_counter *counter, uint64_t *accumulated)
+{
+ float value;
+
+ assert(counter->data_type == PERFQUERY_COUNTER_DATA_FLOAT_INTEL);
+
+ counter->read(counter, accumulated, &value);
+
+ return value;
+}
+
+/******************************************************************************/
+
+/*
+ * OA counter normalisation support...
+ */
+
+static void
+read_accumulated_oa_counter_cb(struct perf_oa_counter *counter,
+ uint64_t *accumulator,
+ void *value)
+{
+ *((uint64_t *)value) = accumulator[counter->accumulator_index];
+}
+
+static void
+accumulate_uint32_cb(struct perf_oa_counter *counter,
+ uint32_t *report0,
+ uint32_t *report1,
+ uint64_t *accumulator)
+{
+ accumulator[counter->accumulator_index] +=
+ (uint32_t)(report1[counter->report_offset] -
+ report0[counter->report_offset]);
+}
+
+#if 0
+/* XXX: we should factor this out for now, but notably BDW has 40bit counters... */
+static void
+accumulate_uint40_cb(struct perf_oa_counter *counter,
+ uint32_t *report0,
+ uint32_t *report1,
+ uint64_t *accumulator)
+{
+ uint32_t value0 = report0[counter->report_offset];
+ uint32_t value1 = report1[counter->report_offset];
+ uint64_t delta;
+
+ if (value0 > value1)
+ delta = (1ULL << 40) + value1 - value0;
+ else
+ delta = value1 - value0;
+
+ accumulator[counter->accumulator_index] += delta;
+}
+#endif
+
+static struct perf_oa_counter *
+add_raw_oa_counter(struct perf_query_builder *builder, int report_offset)
+{
+ struct perf_oa_counter *counter =
+ &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+ counter->report_offset = report_offset;
+ counter->accumulator_index = builder->next_accumulator_index++;
+ counter->accumulate = accumulate_uint32_cb;
+ counter->read = read_accumulated_oa_counter_cb;
+ counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+
+ return counter;
+}
+
+static uint64_t
+hsw_read_report_timestamp(uint32_t *report)
+{
+ /* The least significant timestamp bit represents 80ns on Haswell */
+ return ((uint64_t)report[1]) * 80;
+}
+
+static void
+accumulate_hsw_elapsed_cb(struct perf_oa_counter *counter,
+ uint32_t *report0,
+ uint32_t *report1,
+ uint64_t *accumulator)
+{
+ uint64_t timestamp0 = hsw_read_report_timestamp(report0);
+ uint64_t timestamp1 = hsw_read_report_timestamp(report1);
+
+ accumulator[counter->accumulator_index] += (timestamp1 - timestamp0);
+}
+
+static struct perf_oa_counter *
+add_hsw_elapsed_oa_counter(struct perf_query_builder *builder)
+{
+ struct perf_oa_counter *counter =
+ &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+ counter->accumulator_index = builder->next_accumulator_index++;
+ counter->accumulate = accumulate_hsw_elapsed_cb;
+ counter->read = read_accumulated_oa_counter_cb;
+ counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+
+ return counter;
+}
+
+static void
+read_frequency_cb(struct perf_oa_counter *counter,
+ uint64_t *accumulated,
+ void *value) /* uint64 */
+{
+ uint64_t clk_delta = read_uint64_oa_counter(counter->reference0, accumulated);
+ uint64_t time_delta = read_uint64_oa_counter(counter->reference1, accumulated);
+ uint64_t *ret = value;
+
+ if (!clk_delta) {
+ *ret = 0;
+ return;
+ }
+
+ *ret = (clk_delta * 1000) / time_delta;
+}
+
+static struct perf_oa_counter *
+add_avg_frequency_oa_counter(struct perf_query_builder *builder,
+ struct perf_oa_counter *timestamp)
+{
+ struct perf_oa_counter *counter =
+ &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+ assert(timestamp->data_type == PERFQUERY_COUNTER_DATA_UINT64_INTEL);
+
+ counter->reference0 = builder->gpu_core_clock;
+ counter->reference1 = timestamp;
+ counter->read = read_frequency_cb;
+ counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+
+ return counter;
+}
+
+static void
+read_oa_counter_normalized_by_gpu_duration_cb(struct perf_oa_counter *counter,
+ uint64_t *accumulated,
+ void *value) /* float */
+{
+ uint64_t delta = read_uint64_oa_counter(counter->reference0, accumulated);
+ uint64_t clk_delta = read_uint64_oa_counter(counter->reference1, accumulated);
+ float *ret = value;
+
+ if (!clk_delta) {
+ *ret = 0;
+ return;
+ }
+
+ *ret = ((double)delta * 100.0) / (double)clk_delta;
+}
+
+static struct perf_oa_counter *
+add_oa_counter_normalised_by_gpu_duration(struct perf_query_builder *builder,
+ struct perf_oa_counter *raw)
+{
+ struct perf_oa_counter *counter =
+ &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+ counter->reference0 = raw;
+ counter->reference1 = builder->gpu_core_clock;
+ counter->read = read_oa_counter_normalized_by_gpu_duration_cb;
+ counter->data_type = PERFQUERY_COUNTER_DATA_FLOAT_INTEL;
+
+ return counter;
+}
+
+static void
+read_hsw_samplers_busy_duration_cb(struct perf_oa_counter *counter,
+ uint64_t *accumulated,
+ void *value) /* float */
+{
+ uint64_t sampler0_busy = read_uint64_oa_counter(counter->reference0, accumulated);
+ uint64_t sampler1_busy = read_uint64_oa_counter(counter->reference1, accumulated);
+ uint64_t clk_delta = read_uint64_oa_counter(counter->reference2, accumulated);
+ float *ret = value;
+
+ if (!clk_delta) {
+ *ret = 0;
+ return;
+ }
+
+ *ret = ((double)(sampler0_busy + sampler1_busy) * 100.0) / ((double)clk_delta * 2.0);
+}
+
+static struct perf_oa_counter *
+add_hsw_samplers_busy_duration_oa_counter(struct perf_query_builder *builder,
+ struct perf_oa_counter *sampler0_busy_raw,
+ struct perf_oa_counter *sampler1_busy_raw)
+{
+ struct perf_oa_counter *counter =
+ &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+ counter->reference0 = sampler0_busy_raw;
+ counter->reference1 = sampler1_busy_raw;
+ counter->reference2 = builder->gpu_core_clock;
+ counter->read = read_hsw_samplers_busy_duration_cb;
+ counter->data_type = PERFQUERY_COUNTER_DATA_FLOAT_INTEL;
+
+ return counter;
+}
+
+static void
+read_hsw_slice_extrapolated_cb(struct perf_oa_counter *counter,
+ uint64_t *accumulated,
+ void *value) /* float */
+{
+ uint64_t counter0 = read_uint64_oa_counter(counter->reference0, accumulated);
+ uint64_t counter1 = read_uint64_oa_counter(counter->reference1, accumulated);
+ int eu_count = counter->config;
+ uint64_t *ret = value;
+
+ *ret = (counter0 + counter1) * eu_count;
+}
+
+static struct perf_oa_counter *
+add_hsw_slice_extrapolated_oa_counter(struct perf_query_builder *builder,
+ struct perf_oa_counter *counter0,
+ struct perf_oa_counter *counter1)
+{
+ struct perf_oa_counter *counter =
+ &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+ counter->reference0 = counter0;
+ counter->reference1 = counter1;
+ counter->config = builder->ctx->perfquery.eu_count;
+ counter->read = read_hsw_slice_extrapolated_cb;
+ counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+
+ return counter;
+}
+
+static void
+read_oa_counter_normalized_by_eu_duration_cb(struct perf_oa_counter *counter,
+ uint64_t *accumulated,
+ void *value) /* float */
+{
+ uint64_t delta = read_uint64_oa_counter(counter->reference0, accumulated);
+ uint64_t clk_delta = read_uint64_oa_counter(counter->reference1, accumulated);
+ float *ret = value;
+
+ if (!clk_delta) {
+ *ret = 0;
+ return;
+ }
+
+ delta /= counter->config; /* EU count */
+
+ *ret = (double)delta * 100.0 / (double)clk_delta;
+}
+
+static struct perf_oa_counter *
+add_oa_counter_normalised_by_eu_duration(struct perf_query_builder *builder,
+ struct perf_oa_counter *raw)
+{
+ struct perf_oa_counter *counter =
+ &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+ counter->reference0 = raw;
+ counter->reference1 = builder->gpu_core_clock;
+ counter->config = builder->ctx->perfquery.eu_count;
+ counter->read = read_oa_counter_normalized_by_eu_duration_cb;
+ counter->data_type = PERFQUERY_COUNTER_DATA_FLOAT_INTEL;
+
+ return counter;
+}
+
+static void
+read_av_thread_cycles_counter_cb(struct perf_oa_counter *counter,
+ uint64_t *accumulated,
+ void *value) /* uint64 */
+{
+ uint64_t delta = read_uint64_oa_counter(counter->reference0, accumulated);
+ uint64_t spawned = read_uint64_oa_counter(counter->reference1, accumulated);
+ uint64_t *ret = value;
+
+ if (!spawned) {
+ *ret = 0;
+ return;
+ }
+
+ *ret = delta / spawned;
+}
+
+static struct perf_oa_counter *
+add_average_thread_cycles_oa_counter(struct perf_query_builder *builder,
+ struct perf_oa_counter *raw,
+ struct perf_oa_counter *denominator)
+{
+ struct perf_oa_counter *counter =
+ &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+ counter->reference0 = raw;
+ counter->reference1 = denominator;
+ counter->read = read_av_thread_cycles_counter_cb;
+ counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+
+ return counter;
+}
+
+static void
+read_scaled_uint64_counter_cb(struct perf_oa_counter *counter,
+ uint64_t *accumulated,
+ void *value) /* uint64 */
+{
+ uint64_t delta = read_uint64_oa_counter(counter->reference0, accumulated);
+ uint64_t scale = counter->config;
+ uint64_t *ret = value;
+
+ *ret = delta * scale;
+}
+
+static struct perf_oa_counter *
+add_scaled_uint64_oa_counter(struct perf_query_builder *builder,
+ struct perf_oa_counter *input,
+ int scale)
+{
+ struct perf_oa_counter *counter =
+ &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+ counter->reference0 = input;
+ counter->config = scale;
+ counter->read = read_scaled_uint64_counter_cb;
+ counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+
+ return counter;
+}
+
+static void
+read_max_of_float_counters_cb(struct perf_oa_counter *counter,
+ uint64_t *accumulated,
+ void *value) /* float */
+{
+ float counter0 = read_float_oa_counter(counter->reference0, accumulated);
+ float counter1 = read_float_oa_counter(counter->reference1, accumulated);
+ float *ret = value;
+
+ *ret = counter0 >= counter1 ? counter0 : counter1;
+}
+
+
+static struct perf_oa_counter *
+add_max_of_float_oa_counters(struct perf_query_builder *builder,
+ struct perf_oa_counter *counter0,
+ struct perf_oa_counter *counter1)
+{
+ struct perf_oa_counter *counter =
+ &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+ counter->reference0 = counter0;
+ counter->reference1 = counter1;
+ counter->read = read_max_of_float_counters_cb;
+ counter->data_type = PERFQUERY_COUNTER_DATA_FLOAT_INTEL;
+
+ return counter;
+}
+
+static void
+report_uint64_oa_counter_as_raw_uint64(struct perf_query_builder *builder,
+ const char *name,
+ const char *desc,
+ struct perf_oa_counter *oa_counter)
+{
+ struct perf_query_counter *counter =
+ &builder->query->counters[builder->query->n_counters++];
+
+ counter->oa_counter = oa_counter;
+ counter->name = name;
+ counter->desc = desc;
+ counter->type = PERFQUERY_COUNTER_RAW_INTEL;
+ counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+ counter->raw_max = 0; /* undefined range */
+ counter->offset = pot_align(builder->offset, 8);
+ counter->size = sizeof(uint64_t);
+
+ builder->offset = counter->offset + counter->size;
+}
+
+static void
+report_uint64_oa_counter_as_uint64_event(struct perf_query_builder *builder,
+ const char *name,
+ const char *desc,
+ struct perf_oa_counter *oa_counter)
+{
+ struct perf_query_counter *counter =
+ &builder->query->counters[builder->query->n_counters++];
+
+ counter->oa_counter = oa_counter;
+ counter->name = name;
+ counter->desc = desc;
+ counter->type = PERFQUERY_COUNTER_EVENT_INTEL;
+ counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+ counter->offset = pot_align(builder->offset, 8);
+ counter->size = sizeof(uint64_t);
+
+ builder->offset = counter->offset + counter->size;
+}
+
+static void
+report_float_oa_counter_as_percentage_duration(struct perf_query_builder *builder,
+ const char *name,
+ const char *desc,
+ struct perf_oa_counter *oa_counter)
+{
+ struct perf_query_counter *counter =
+ &builder->query->counters[builder->query->n_counters++];
+
+ counter->oa_counter = oa_counter;
+ counter->name = name;
+ counter->desc = desc;
+ counter->type = PERFQUERY_COUNTER_DURATION_RAW_INTEL;
+ counter->data_type = PERFQUERY_COUNTER_DATA_FLOAT_INTEL;
+ counter->raw_max = 100;
+ counter->offset = pot_align(builder->offset, 4);
+ counter->size = sizeof(float);
+
+ builder->offset = counter->offset + counter->size;
+}
+
+static void
+report_uint64_oa_counter_as_throughput(struct perf_query_builder *builder,
+ const char *name,
+ const char *desc,
+ struct perf_oa_counter *oa_counter)
+{
+ struct perf_query_counter *counter =
+ &builder->query->counters[builder->query->n_counters++];
+
+ counter->oa_counter = oa_counter;
+ counter->name = name;
+ counter->desc = desc;
+ counter->type = PERFQUERY_COUNTER_THROUGHPUT_INTEL;
+ counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+ counter->offset = pot_align(builder->offset, 8);
+ counter->size = sizeof(uint64_t);
+
+ builder->offset = counter->offset + counter->size;
+}
+
+static void
+report_uint64_oa_counter_as_duration(struct perf_query_builder *builder,
+ const char *name,
+ const char *desc,
+ struct perf_oa_counter *oa_counter)
+{
+ struct perf_query_counter *counter =
+ &builder->query->counters[builder->query->n_counters++];
+
+ counter->oa_counter = oa_counter;
+ counter->name = name;
+ counter->desc = desc;
+ counter->type = PERFQUERY_COUNTER_DURATION_RAW_INTEL;
+ counter->data_type = PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+ counter->raw_max = 0;
+ counter->offset = pot_align(builder->offset, 8);
+ counter->size = sizeof(uint64_t);
+
+ builder->offset = counter->offset + counter->size;
+}
+
+static void
+add_aggregate_counters(struct perf_query_builder *builder)
+{
+ struct perf_oa_counter *raw;
+ struct perf_oa_counter *active, *stall, *n_threads;
+ struct perf_oa_counter *c;
+ int a_offset = builder->a_offset;
+ int aggregate_active_counter = a_offset + 17; /* aggregate active */
+ int aggregate_stall_counter = a_offset + 18; /* aggregate stall */
+ int n_threads_counter = a_offset + 20;
+
+ raw = add_raw_oa_counter(builder, a_offset + 41);
+ c = add_oa_counter_normalised_by_gpu_duration(builder, raw);
+ report_float_oa_counter_as_percentage_duration(builder,
+ "GPU Busy",
+ "The percentage of time in which the GPU has being processing GPU commands.",
+ c);
+
+ raw = add_raw_oa_counter(builder, a_offset); /* aggregate EU active */
+ c = add_oa_counter_normalised_by_eu_duration(builder, raw);
+ report_float_oa_counter_as_percentage_duration(builder,
+ "EU Active",
+ "The percentage of time in which the Execution Units were actively processing.",
+ c);
+
+ raw = add_raw_oa_counter(builder, a_offset + 1); /* aggregate EU stall */
+ c = add_oa_counter_normalised_by_eu_duration(builder, raw);
+ report_float_oa_counter_as_percentage_duration(builder,
+ "EU Stall",
+ "The percentage of time in which the Execution Units were stalled.",
+ c);
+
+
+ active = add_raw_oa_counter(builder, aggregate_active_counter);
+ c = add_oa_counter_normalised_by_eu_duration(builder, active);
+ report_float_oa_counter_as_percentage_duration(builder,
+ "CS EU Active",
+ "The percentage of time in which compute shader were "
+ "processed actively on the EUs.",
+ c);
+
+ stall = add_raw_oa_counter(builder, aggregate_stall_counter);
+ c = add_oa_counter_normalised_by_eu_duration(builder, stall);
+ report_float_oa_counter_as_percentage_duration(builder,
+ "CS EU Stall",
+ "The percentage of time in which compute shader were "
+ "stalled on the EUs.",
+ c);
+
+
+ n_threads = add_raw_oa_counter(builder, n_threads_counter);
+ c = add_average_thread_cycles_oa_counter(builder, active, n_threads);
+ report_uint64_oa_counter_as_raw_uint64(builder,
+ "CS AVG Active per Thread",
+ "The average number of cycles per hardware "
+ "thread run in which compute shader were processed actively "
+ "on the EUs.",
+ c);
+
+ c = add_average_thread_cycles_oa_counter(builder, stall, n_threads);
+ report_uint64_oa_counter_as_raw_uint64(builder,
+ "CS AVG Stall per Thread",
+ "The average number of cycles per hardware "
+ "thread run in which compute shader were stalled "
+ "on the EUs.",
+ c);
+
+
+ #if 0
+ raw = add_raw_oa_counter(builder, a_offset + 32); /* hiz fast z passing */
+ raw = add_raw_oa_counter(builder, a_offset + 33); /* hiz fast z failing */
+
+ raw = add_raw_oa_counter(builder, a_offset + 42); /* vs bottleneck */
+ raw = add_raw_oa_counter(builder, a_offset + 43); /* gs bottleneck */
+ #endif
+}
+
+static void
+hsw_add_compute_counters(struct perf_query_builder *builder)
+{
+ struct perf_oa_counter *raw0;
+ struct perf_oa_counter *raw1;
+ struct perf_oa_counter *typed_read;
+ struct perf_oa_counter *typed_write;
+ struct perf_oa_counter *typed_atomics;
+ struct perf_oa_counter *untyped_read;
+ struct perf_oa_counter *untyped_write;
+ struct perf_oa_counter *slm_read;
+ struct perf_oa_counter *slm_write;
+
+ raw0 = add_raw_oa_counter(builder, 0xd0>>2);
+ raw0 = add_scaled_uint64_oa_counter(builder, raw0, 64);
+
+ raw1 = add_raw_oa_counter(builder, 0xd4>>2);
+ raw1 = add_scaled_uint64_oa_counter(builder, raw1, 64);
+
+ typed_read = add_hsw_slice_extrapolated_oa_counter(builder, raw0, raw1);
+ report_uint64_oa_counter_as_raw_uint64(builder,
+ "TYPED_BYTES_READ",
+ "TYPED_BYTES_READ",
+ typed_read);
+
+ raw0 = add_raw_oa_counter(builder, 0xd8>>2);
+ raw0 = add_scaled_uint64_oa_counter(builder, raw0, 64);
+
+ raw1 = add_raw_oa_counter(builder, 0xdc>>2);
+ raw1 = add_scaled_uint64_oa_counter(builder, raw1, 64);
+
+ typed_write = add_hsw_slice_extrapolated_oa_counter(builder, raw0, raw1);
+ report_uint64_oa_counter_as_raw_uint64(builder,
+ "TYPED_BYTES_WRITTEN",
+ "TYPED_BYTES_WRITTEN",
+ typed_write);
+
+ raw0 = add_raw_oa_counter(builder, 0xc0>>2);
+ raw0 = add_scaled_uint64_oa_counter(builder, raw0, 64);
+
+ raw1 = add_raw_oa_counter(builder, 0xc4>>2);
+ raw1 = add_scaled_uint64_oa_counter(builder, raw1, 64);
+
+ untyped_read = add_hsw_slice_extrapolated_oa_counter(builder, raw0, raw1);
+ report_uint64_oa_counter_as_raw_uint64(builder,
+ "UNTYPED_BYTES_READ",
+ "UNTYPED_BYTES_READ",
+ untyped_read);
+
+ raw0 = add_raw_oa_counter(builder, 0xc8>>2);
+ raw0 = add_scaled_uint64_oa_counter(builder, raw0, 64);
+
+ raw1 = add_raw_oa_counter(builder, 0xcc>>2);
+ raw1 = add_scaled_uint64_oa_counter(builder, raw1, 64);
+
+ untyped_write = add_hsw_slice_extrapolated_oa_counter(builder, raw0, raw1);
+ report_uint64_oa_counter_as_raw_uint64(builder,
+ "UNTYPED_BYTES_WRITTEN",
+ "UNTYPED_BYTES_WRITTEN",
+ untyped_write);
+
+ raw0 = add_raw_oa_counter(builder, 0xf8>>2);
+ raw0 = add_scaled_uint64_oa_counter(builder, raw0, 64);
+
+ raw1 = add_raw_oa_counter(builder, 0xfc>>2);
+ raw1 = add_scaled_uint64_oa_counter(builder, raw1, 64);
+
+ slm_read = add_hsw_slice_extrapolated_oa_counter(builder, raw0, raw1);
+ report_uint64_oa_counter_as_raw_uint64(builder,
+ "SLM_BYTES_READ",
+ "SLM_BYTES_READ",
+ slm_read);
+
+ raw0 = add_raw_oa_counter(builder, 0xf0>>2);
+ raw0 = add_scaled_uint64_oa_counter(builder, raw0, 64);
+
+ raw1 = add_raw_oa_counter(builder, 0xf4>>2);
+ raw1 = add_scaled_uint64_oa_counter(builder, raw1, 64);
+
+ slm_write = add_hsw_slice_extrapolated_oa_counter(builder, raw0, raw1);
+ report_uint64_oa_counter_as_raw_uint64(builder,
+ "SLM_BYTES_WRITTEN",
+ "SLM_BYTES_WRITTEN",
+ slm_write);
+
+ raw0 = add_raw_oa_counter(builder, 0xe0>>2);
+ raw1 = add_raw_oa_counter(builder, 0xe4>>2);
+ typed_atomics = add_hsw_slice_extrapolated_oa_counter(builder, raw0, raw1);
+ report_uint64_oa_counter_as_raw_uint64(builder,
+ "TYPED_ATOMICS",
+ "TYPED_ATOMICS",
+ typed_atomics);
+}
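hsw_add_compute_counters() repeats the same three-step pattern for every byte
counter: read two raw A counters, scale each event by 64 (presumably one
64-byte cache line per counted access), and extrapolate across slices. A small
helper could fold that pattern into one call; a minimal sketch, assuming the
builder helpers above keep the signatures used in this file (the helper name
itself is hypothetical and not part of this patch):

    /* Hypothetical helper: build one slice-extrapolated byte counter from a
     * pair of raw A-counter byte offsets, scaling each event by 64 bytes. */
    static struct perf_oa_counter *
    add_hsw_byte_pair_counter(struct perf_query_builder *builder,
                              int offset0, int offset1)
    {
        struct perf_oa_counter *raw0 = add_raw_oa_counter(builder, offset0 >> 2);
        struct perf_oa_counter *raw1 = add_raw_oa_counter(builder, offset1 >> 2);

        raw0 = add_scaled_uint64_oa_counter(builder, raw0, 64);
        raw1 = add_scaled_uint64_oa_counter(builder, raw1, 64);

        return add_hsw_slice_extrapolated_oa_counter(builder, raw0, raw1);
    }

    /* e.g. typed_read = add_hsw_byte_pair_counter(builder, 0xd0, 0xd4); */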
+
+
+static void
+hsw_add_compute_basic_oa_counter_query(cl_context ctx)
+{
+ struct perf_query_builder builder;
+ struct perf_query *query = &ctx->perfquery.queries[ctx->perfquery.n_queries++];
+ int a_offset;
+ int b_offset;
+ int c_offset;
+ struct perf_oa_counter *elapsed;
+ struct perf_oa_counter *c;
+ struct perf_query_counter *last;
+
+ query->name = "Gen7 Compute Basic Observability Architecture Counters";
+ query->counters = malloc(sizeof(struct perf_query_counter) *
+ MAX_PERF_QUERY_COUNTERS);
+ query->n_counters = 0;
+ query->oa_counters = malloc(sizeof(struct perf_oa_counter) *
+ MAX_OA_QUERY_COUNTERS);
+ query->n_oa_counters = 0;
+ query->oa_metrics_set = I915_OA_METRICS_SET_COMPUTE;
+ query->oa_format = I915_OA_FORMAT_A45_B8_C8_HSW;
+ // query->perf_raw_size = 256; /* XXX */
+
+ builder.ctx = ctx;
+ builder.query = query;
+ builder.offset = 0;
+ builder.next_accumulator_index = 0;
+
+ builder.a_offset = a_offset = 3;
+ builder.b_offset = b_offset = a_offset + 45;
+ builder.c_offset = c_offset = b_offset + 8;
+
+ /* Can be referenced by other counters... */
+ builder.gpu_core_clock = add_raw_oa_counter(&builder, c_offset + 2);
+
+ elapsed = add_hsw_elapsed_oa_counter(&builder);
+ report_uint64_oa_counter_as_duration(&builder,
+ "GPU Time Elapsed",
+ "Time elapsed on the GPU during the measurement.",
+ elapsed);
+
+ c = add_avg_frequency_oa_counter(&builder, elapsed);
+ report_uint64_oa_counter_as_uint64_event(&builder,
+ "AVG GPU Core Frequency",
+ "Average GPU Core Frequency in the measurement.",
+ c);
+
+ add_aggregate_counters(&builder);
+
+ hsw_add_compute_counters(&builder);
+
+ assert(query->n_counters < MAX_PERF_QUERY_COUNTERS);
+ assert(query->n_oa_counters < MAX_OA_QUERY_COUNTERS);
+
+ last = &query->counters[query->n_counters - 1];
+ query->data_size = last->offset + last->size;
+}
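The a/b/c offsets above encode the OA report layout selected by
I915_OA_FORMAT_A45_B8_C8_HSW: 64 dwords (256 bytes) per report, a small header
followed by the A, B and C counter groups. A sketch of the layout assumed by
those numbers (the header contents beyond report id and timestamp are not
spelled out in this patch):

    /* Dword indices within one 256-byte A45_B8_C8 OA report (assumed):
     *   0 ..  2   report header (report id, timestamp, ...)
     *   3 .. 47   A0..A44 aggregate counters   -> a_offset = 3
     *  48 .. 55   B0..B7  counters             -> b_offset = a_offset + 45
     *  56 .. 63   C0..C7  counters             -> c_offset = b_offset + 8
     */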
+
+static int
+get_oa_counter_data(cl_context ctx,
+ struct perf_query_object *obj,
+ size_t data_size,
+ uint8_t *data)
+{
+ const struct perf_query *query = obj->query;
+ int n_counters = query->n_counters;
+ int written = 0, i;
+
+ if (!obj->oa.results_accumulated) {
+ accumulate_oa_snapshots(ctx, obj);
+ assert(obj->oa.results_accumulated);
+ }
+
+ for (i = 0; i < n_counters; i++) {
+ const struct perf_query_counter *counter = &query->counters[i];
+
+ if (counter->size) {
+ counter->oa_counter->read(counter->oa_counter, obj->oa.accumulator,
+ data + counter->offset);
+ written = counter->offset + counter->size;
+ }
+ }
+
+ return written;
+}
+
+/**
+ * Get the performance query result.
+ */
+static void
+get_perf_query_data(cl_context ctx,
+ struct perf_query_object *obj,
+ size_t data_size,
+ cl_uint *data,
+ cl_uint *bytes_written)
+{
+ int written = 0;
+
+ //assert(is_perf_query_ready(ctx, queue, obj));
+
+ /* XXX handle in flags */
+ wait_perf_query(ctx, obj);
+
+ written = get_oa_counter_data(ctx, obj, data_size, (uint8_t *)data);
+
+ if (bytes_written)
+ *bytes_written = written;
+}
+
+static void
+close_perf(cl_context ctx)
+{
+ if (ctx->perfquery.perf_oa_event_fd != -1) {
+ if (ctx->perfquery.perf_oa_mmap_base) {
+ size_t mapping_len =
+ ctx->perfquery.perf_oa_buffer_size + ctx->perfquery.page_size;
+
+ munmap(ctx->perfquery.perf_oa_mmap_base, mapping_len);
+ ctx->perfquery.perf_oa_mmap_base = NULL;
+ }
+
+ close(ctx->perfquery.perf_oa_event_fd);
+ ctx->perfquery.perf_oa_event_fd = -1;
+ }
+}
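close_perf() unmaps perf_oa_buffer_size plus one extra page before closing the
event fd, which matches the usual perf_event_open() ring-buffer layout: one
struct perf_event_mmap_page of metadata followed by 2^n data pages. The
corresponding open/mmap path is not in this hunk; a minimal sketch of what it
presumably pairs with, assuming the fd returned by perf_event_open() is already
stored in perf_oa_event_fd:

    /* Sketch only: map one metadata page plus the power-of-two data buffer. */
    size_t mapping_len = ctx->perfquery.perf_oa_buffer_size +
                         ctx->perfquery.page_size;
    void *base = mmap(NULL, mapping_len, PROT_READ | PROT_WRITE, MAP_SHARED,
                      ctx->perfquery.perf_oa_event_fd, 0);
    if (base == MAP_FAILED)
        return false;   /* assumed error convention of the open helper */
    ctx->perfquery.perf_oa_mmap_base = base;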
+
+void
+intel_perf_query_first(cl_context ctx, cl_uint *queryId)
+{
+ *queryId = 0;
+}
+
+void
+intel_perf_query_next(cl_context ctx, cl_uint queryId, cl_uint *nextId)
+{
+ if (queryId < ctx->perfquery.n_queries - 1)
+ *nextId = queryId + 1;
+ else
+ *nextId = queryId;
+}
+
+void
+intel_perf_query_info(cl_context ctx,
+ cl_uint queryId,
+ cl_char **queryName,
+ cl_uint *dataSize,
+ cl_uint *noCounters,
+ cl_uint *noInstances)
+{
+ struct perf_query *query = &ctx->perfquery.queries[queryId];
+
+ *queryName = query->name;
+ *dataSize = query->data_size;
+ *noCounters = query->n_counters;
+ *noInstances = 1; /* the current implementation allows only 1 instance */
+}
+
+void
+intel_perf_counter_info(cl_context ctx,
+ cl_uint queryId, cl_uint counterId,
+ cl_char **counterName,
+ cl_char **counterDesc,
+ cl_uint *counterOffset, cl_uint *counterDataSize,
+ cl_uint *counterTypeEnum, cl_uint *counterDataTypeEnum,
+ cl_ulong *rawCounterMaxValue)
+{
+ struct perf_query *query = &ctx->perfquery.queries[queryId];
+ struct perf_query_counter *counter = &query->counters[counterId];
+
+ *counterName = counter->name;
+ *counterDesc = counter->desc;
+ *counterOffset = counter->offset;
+ *counterDataSize = counter->size;
+ *counterTypeEnum = counter->type;
+ *counterDataTypeEnum = counter->data_type;
+ *rawCounterMaxValue = counter->raw_max;
+}
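intel_perf_counter_info() and intel_perf_query_get_data() together define the
result ABI: each counter value sits at counterOffset in the buffer filled by
the get-data call, occupies counterDataSize bytes, and is typed by
counterDataTypeEnum. A minimal decoding sketch for a 64-bit counter, assuming
the UINT64 data-type enum from cl_intel.h and a buffer already filled by the
get-data path:

    /* 'data' was filled by intel_perf_query_get_data(); 'offset', 'size' and
     * 'data_type' came from intel_perf_counter_info() for this counter. */
    cl_ulong value = 0;
    if (data_type == PERFQUERY_COUNTER_DATA_UINT64_INTEL &&
        size == sizeof(value))
        memcpy(&value, (const uint8_t *)data + offset, sizeof(value));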
+
+cl_int
+intel_perf_query_create(cl_context context, cl_uint queryId, cl_perf_query_intel *queryHandle)
+{
+ struct perf_query *query = &context->perfquery.queries[queryId];
+ struct perf_query_object *obj =
+ calloc(1, sizeof(struct perf_query_object));
+
+ if (!obj)
+ return -1;
+
+ obj->query = query;
+
+ obj->batch = intel_batchbuffer_new((intel_driver_t *)context->drv);
+ if (!obj->batch) {
+ fprintf(stderr, "failed to create perf batch\n");
+ free(obj);
+ return -1;
+ }
+
+ context->perfquery.n_query_instances++;
+
+ *queryHandle = obj;
+
+ return 0;
+}
+
+cl_int
+intel_perf_query_delete(cl_context context, cl_perf_query_intel queryHandle)
+{
+ struct perf_query_object *obj = (struct perf_query_object *)queryHandle;
+
+ if (obj->oa.bo) {
+ if (!obj->oa.results_accumulated) {
+ drop_from_unresolved_query_list(context, obj);
+ dec_n_oa_users(context);
+ }
+
+ drm_intel_bo_unreference(obj->oa.bo);
+ obj->oa.bo = NULL;
+ }
+
+ obj->oa.results_accumulated = false;
+
+ intel_batchbuffer_delete(obj->batch);
+
+ free(obj);
+
+ if (--context->perfquery.n_query_instances == 0)
+ close_perf(context);
+
+ return 0;
+}
+
+cl_int
+intel_perf_query_begin(cl_context context, cl_perf_query_intel queryHandle)
+{
+ bool ret;
+ struct perf_query_object *obj = (struct perf_query_object *)queryHandle;
+
+ ret = begin_perf_query(context, obj);
+ if (!ret)
+ return -1;
+
+ return 0;
+}
+
+cl_int
+intel_perf_query_end(cl_context context, cl_perf_query_intel queryHandle)
+{
+ struct perf_query_object *obj = (struct perf_query_object *)queryHandle;
+
+ end_perf_query(context, obj);
+ return 0;
+}
+
+cl_int
+intel_perf_query_get_data(cl_context context,
+ cl_perf_query_intel queryHandle,
+ cl_uint flags, size_t dataSize, void *data,
+ cl_uint *bytesWritten)
+{
+ struct perf_query_object *obj = (struct perf_query_object *)queryHandle;
+
+ /* XXX flags? */
+ get_perf_query_data(context, obj,
+ dataSize, data,
+ bytesWritten);
+
+ return 0;
+}
+
+
+void
+intel_perf_query_init(cl_context context)
+{
+ intel_driver_t *drv = (intel_driver_t *)context->drv;
+
+ if (!IS_HASWELL(drv->device_id)) {
+ fprintf(stderr, "Perf query only supports on HSW now.\n");
+ context->perfquery.enable = false;
+ return;
+ }
+
+ /* XXX check whether the kernel's i915 PMU driver is available */
+ if (lookup_i915_oa_id() == 0) {
+ fprintf(stderr, "Kernel has no i915 PMU driver.\n");
+ context->perfquery.enable = false;
+ return;
+ }
+
+ context->perfquery.eu_count = context->device->max_compute_unit;
+
+ context->perfquery.read_oa_report_timestamp = hsw_read_report_timestamp;
+
+ /* initialize Intel query structs depending on the device */
+ hsw_add_compute_basic_oa_counter_query(context);
+
+ context->perfquery.unresolved = calloc(2, sizeof(struct perf_query_object *));
+ context->perfquery.unresolved_elements = 0;
+ context->perfquery.unresolved_array_size = 2;
+
+ context->perfquery.page_size = sysconf(_SC_PAGE_SIZE);
+
+ context->perfquery.perf_oa_event_fd = -1;
+ context->perfquery.perf_oa_buffer_size = 1024 * 1024; /* NB: must be power of two */
+
+ context->perfquery.next_query_start_report_id = 1000;
+
+ context->perfquery.enable = true;
+}
+
+void
+intel_perf_query_destroy(cl_context context)
+{
+ int i;
+ struct perf_query *query;
+
+ for (i = 0; i < context->perfquery.n_queries; i++) {
+ query = &context->perfquery.queries[i];
+ free(query->counters);
+ free(query->oa_counters);
+ }
+ free(context->perfquery.unresolved);
+}
diff --git a/src/intel_perf.h b/src/intel_perf.h
new file mode 100644
index 0000000..5099433
--- /dev/null
+++ b/src/intel_perf.h
@@ -0,0 +1,32 @@
+#ifndef _INTEL_PERF_H
+#define _INTEL_PERF_H
+
+#include "CL/cl.h"
+#include "CL/cl_intel.h"
+
+void intel_perf_query_first(cl_context, cl_uint *queryId);
+void intel_perf_query_next(cl_context, cl_uint queryId, cl_uint *nextId);
+void intel_perf_query_info(cl_context, cl_uint queryId,
+ cl_char **queryName,
+ cl_uint *dataSize, cl_uint *noCounters, cl_uint *noInstances);
+void intel_perf_counter_info(cl_context, cl_uint queryId, cl_uint counterId,
+ cl_char **counterName,
+ cl_char **counterDesc,
+ cl_uint *counterOffset, cl_uint *counterDataSize,
+ cl_uint *counterTypeEnum, cl_uint *counterDataTypeEnum,
+ cl_ulong *rawCounterMaxValue);
+cl_int intel_perf_query_create(cl_context context, cl_uint queryId,
+ cl_perf_query_intel *queryHandle);
+cl_int intel_perf_query_delete(cl_context context, cl_perf_query_intel queryHandle);
+cl_int intel_perf_query_begin(cl_context context, cl_perf_query_intel queryHandle);
+cl_int intel_perf_query_end(cl_context context, cl_perf_query_intel queryHandle);
+cl_int intel_perf_query_get_data(cl_context context,
+ cl_perf_query_intel queryHandle,
+ cl_uint flags, size_t dataSize, void *data,
+ cl_uint *bytesWritten);
+
+void intel_perf_query_init(cl_context context);
+void intel_perf_query_destroy(cl_context context);
+
+#endif
+
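The declarations above cover the whole internal flow the extension entry
points dispatch to. A typical single-measurement round trip, sketched against
these signatures (a sketch only: error handling and the kernel enqueue in the
middle are elided, and 'ctx' is assumed to be a valid cl_context on a
supported device with perfquery enabled by intel_perf_query_init()):

    cl_uint id, data_size, n_counters, n_instances, written = 0;
    cl_char *name;
    cl_perf_query_intel q;
    void *data;

    intel_perf_query_first(ctx, &id);
    intel_perf_query_info(ctx, id, &name, &data_size, &n_counters, &n_instances);

    if (intel_perf_query_create(ctx, id, &q) == 0) {
        intel_perf_query_begin(ctx, q);
        /* ... enqueue and finish the kernels to be measured ... */
        intel_perf_query_end(ctx, q);

        data = malloc(data_size);
        intel_perf_query_get_data(ctx, q, 0, data_size, data, &written);
        /* decode individual counters via intel_perf_counter_info() offsets */

        free(data);
        intel_perf_query_delete(ctx, q);
    }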
--
2.1.4