[Intel-gfx] [RFC 4/8] drm/i915: Add mechanism for forwarding async OA counter snapshots through perf
sourab.gupta at intel.com
sourab.gupta at intel.com
Mon Jun 22 02:50:15 PDT 2015
From: Sourab Gupta <sourab.gupta at intel.com>
This patch adds the mechanism for forwarding the asynchronous OA snapshots
through the perf event interface.
Each node of data collected is forwarded as a separate perf sample.
Each snapshot has two fields: the first is the raw report, and the second is
a footer carrying metadata for the snapshot, such as ctx_id and pid.
The size of the raw report is the one specified during event init.
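The samples are forwarded from a work item, which is scheduled when the
hrtimer fires. Forwarding is deferred to a work item because it needs the
device mutex, which cannot be taken in the atomic context where the snapshots
are flushed. In the work item, each node of data collected is forwarded as a
separate perf sample.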
Signed-off-by: Sourab Gupta <sourab.gupta at intel.com>
---
drivers/gpu/drm/i915/i915_drv.h | 5 +-
drivers/gpu/drm/i915/i915_oa_perf.c | 158 +++++++++++++++++++++++++++++++++++-
2 files changed, 161 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index da150bc..d738f7a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1691,7 +1691,8 @@ struct drm_i915_oa_async_queue_header {
struct drm_i915_oa_async_node_info {
__u32 pid;
__u32 ctx_id;
- __u32 pad[14];
+ struct drm_i915_gem_request *req;
+ __u32 pad[12];
};
struct drm_i915_oa_async_node {
@@ -1975,7 +1976,9 @@ struct drm_i915_private {
u32 tail;
int format;
int format_size;
+ u8 *snapshot;
} oa_async_buffer;
+ struct work_struct work_timer;
} oa_pmu;
#endif
diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
index 419b6a5..3bf4c47 100644
--- a/drivers/gpu/drm/i915/i915_oa_perf.c
+++ b/drivers/gpu/drm/i915/i915_oa_perf.c
@@ -25,6 +25,128 @@ static int hsw_perf_format_sizes[] = {
64 /* C4_B8_HSW */
};
+static void init_oa_async_buf_queue(struct drm_i915_private *dev_priv)
+{
+ struct drm_i915_oa_async_queue_header *hdr =
+ (struct drm_i915_oa_async_queue_header *)
+ dev_priv->oa_pmu.oa_async_buffer.addr;
+ void *data_ptr;
+
+ hdr->size_in_bytes = dev_priv->oa_pmu.oa_async_buffer.obj->base.size;
+ /* 64 bit alignment for OA node address */
+ data_ptr = PTR_ALIGN((void *)(hdr + 1), 64);
+ hdr->data_offset = (__u64)(data_ptr - (void *)hdr);
+
+ hdr->node_count = 0;
+ hdr->wrap_count = 0;
+}
+
+static void forward_one_oa_async_sample(struct drm_i915_private *dev_priv,
+ struct drm_i915_oa_async_node *node)
+{
+ struct perf_sample_data data;
+ struct perf_event *event = dev_priv->oa_pmu.exclusive_event;
+ int format_size, snapshot_size;
+ u8 *snapshot;
+ struct perf_raw_record raw;
+
+ format_size = dev_priv->oa_pmu.oa_async_buffer.format_size;
+ snapshot_size = format_size +
+ sizeof(struct drm_i915_oa_async_node_footer);
+ snapshot = dev_priv->oa_pmu.oa_async_buffer.snapshot;
+
+ memcpy(snapshot, node, format_size);
+ memcpy(snapshot + format_size, &node->node_info,
+ sizeof(struct drm_i915_oa_async_node_footer));
+
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+
+ /* Note: the combined u32 raw->size member + raw data itself must be 8
+ * byte aligned. (See note in init_oa_buffer for more details) */
+ raw.size = snapshot_size + 4;
+ raw.data = snapshot;
+
+ data.raw = &raw;
+
+ perf_event_overflow(event, &data, &dev_priv->oa_pmu.dummy_regs);
+}
+
+void i915_oa_async_wait_gpu(struct drm_i915_private *dev_priv)
+{
+ struct drm_i915_oa_async_queue_header *hdr =
+ (struct drm_i915_oa_async_queue_header *)
+ dev_priv->oa_pmu.oa_async_buffer.addr;
+ struct drm_i915_oa_async_node *first_node, *node;
+ int ret, head, tail, num_nodes;
+ struct drm_i915_gem_request *req;
+
+ first_node = (struct drm_i915_oa_async_node *)
+ ((char *)hdr + hdr->data_offset);
+ num_nodes = (hdr->size_in_bytes - hdr->data_offset) /
+ sizeof(*node);
+
+
+ tail = hdr->node_count;
+ head = dev_priv->oa_pmu.oa_async_buffer.head;
+
+ /* wait for all requests to complete*/
+ while ((head % num_nodes) != (tail % num_nodes)) {
+ node = &first_node[head % num_nodes];
+ req = node->node_info.req;
+ if (req) {
+ if (!i915_gem_request_completed(req, true)) {
+ ret = i915_wait_request(req);
+ if (ret)
+ DRM_DEBUG_DRIVER(
+ "oa async: failed to wait\n");
+ }
+ i915_gem_request_assign(&node->node_info.req, NULL);
+ }
+ head++;
+ }
+}
+
+void forward_oa_async_snapshots_work(struct work_struct *__work)
+{
+ struct drm_i915_private *dev_priv =
+ container_of(__work, typeof(*dev_priv),
+ oa_pmu.work_timer);
+ struct drm_i915_oa_async_queue_header *hdr =
+ (struct drm_i915_oa_async_queue_header *)
+ dev_priv->oa_pmu.oa_async_buffer.addr;
+ struct drm_i915_oa_async_node *first_node, *node;
+ int ret, head, tail, num_nodes;
+ struct drm_i915_gem_request *req;
+
+ first_node = (struct drm_i915_oa_async_node *)
+ ((char *)hdr + hdr->data_offset);
+ num_nodes = (hdr->size_in_bytes - hdr->data_offset) /
+ sizeof(*node);
+
+ ret = i915_mutex_lock_interruptible(dev_priv->dev);
+ if (ret)
+ return;
+
+ tail = hdr->node_count;
+ head = dev_priv->oa_pmu.oa_async_buffer.head;
+
+ while ((head % num_nodes) != (tail % num_nodes)) {
+ node = &first_node[head % num_nodes];
+ req = node->node_info.req;
+ if (req && i915_gem_request_completed(req, true)) {
+ forward_one_oa_async_sample(dev_priv, node);
+ i915_gem_request_assign(&node->node_info.req, NULL);
+ head++;
+ } else
+ break;
+ }
+
+ dev_priv->oa_pmu.oa_async_buffer.tail = tail;
+ dev_priv->oa_pmu.oa_async_buffer.head = head;
+
+ mutex_unlock(&dev_priv->dev->struct_mutex);
+}
+
static void forward_one_oa_snapshot_to_event(struct drm_i915_private *dev_priv,
u8 *snapshot,
struct perf_event *event)
@@ -58,6 +180,14 @@ static u32 forward_oa_snapshots(struct drm_i915_private *dev_priv,
u8 *snapshot;
u32 taken;
+ /*
+ * Schedule a wq to forward the async samples collected. We schedule
+ * wq here, since it requires device mutex to be taken which can't be
+ * done here because of atomic context
+ */
+ if (dev_priv->oa_pmu.async_sample_mode)
+ schedule_work(&dev_priv->oa_pmu.work_timer);
+
head -= dev_priv->oa_pmu.oa_buffer.gtt_offset;
tail -= dev_priv->oa_pmu.oa_buffer.gtt_offset;
@@ -176,6 +306,8 @@ oa_async_buffer_destroy(struct drm_i915_private *i915)
i915->oa_pmu.oa_async_buffer.obj = NULL;
i915->oa_pmu.oa_async_buffer.addr = NULL;
+ kfree(i915->oa_pmu.oa_async_buffer.snapshot);
+
mutex_unlock(&i915->dev->struct_mutex);
}
@@ -358,7 +490,7 @@ static int init_async_oa_buffer(struct perf_event *event)
struct drm_i915_private *dev_priv =
container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
struct drm_i915_gem_object *bo;
- int ret;
+ int snapshot_size, ret;
BUG_ON(!IS_HASWELL(dev_priv->dev));
BUG_ON(dev_priv->oa_pmu.oa_async_buffer.obj);
@@ -374,6 +506,12 @@ static int init_async_oa_buffer(struct perf_event *event)
dev_priv->oa_pmu.oa_async_buffer.obj = bo;
dev_priv->oa_pmu.oa_async_buffer.addr = vmap_oa_buffer(bo);
+ init_oa_async_buf_queue(dev_priv);
+
+ snapshot_size = dev_priv->oa_pmu.oa_async_buffer.format_size +
+ sizeof(struct drm_i915_oa_async_node_footer);
+ dev_priv->oa_pmu.oa_async_buffer.snapshot =
+ kmalloc(snapshot_size, GFP_KERNEL);
DRM_DEBUG_DRIVER("OA Async Buffer initialized, vaddr = %p",
dev_priv->oa_pmu.oa_async_buffer.addr);
@@ -814,6 +952,11 @@ static void i915_oa_event_stop(struct perf_event *event, int flags)
flush_oa_snapshots(dev_priv, false);
}
+ if (dev_priv->oa_pmu.async_sample_mode) {
+ dev_priv->oa_pmu.oa_async_buffer.tail = 0;
+ dev_priv->oa_pmu.oa_async_buffer.head = 0;
+ }
+
event->hw.state = PERF_HES_STOPPED;
}
@@ -844,7 +987,15 @@ static int i915_oa_event_flush(struct perf_event *event)
if (event->attr.sample_period) {
struct drm_i915_private *i915 =
container_of(event->pmu, typeof(*i915), oa_pmu.pmu);
+ int ret;
+ if (i915->oa_pmu.async_sample_mode) {
+ ret = i915_mutex_lock_interruptible(i915->dev);
+ if (ret)
+ return ret;
+ i915_oa_async_wait_gpu(i915);
+ mutex_unlock(&i915->dev->struct_mutex);
+ }
flush_oa_snapshots(i915, true);
}
@@ -940,6 +1091,8 @@ void i915_oa_pmu_register(struct drm_device *dev)
hrtimer_init(&i915->oa_pmu.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
i915->oa_pmu.timer.function = hrtimer_sample;
+ INIT_WORK(&i915->oa_pmu.work_timer, forward_oa_async_snapshots_work);
+
spin_lock_init(&i915->oa_pmu.lock);
i915->oa_pmu.pmu.capabilities = PERF_PMU_CAP_IS_DEVICE;
@@ -969,6 +1122,9 @@ void i915_oa_pmu_unregister(struct drm_device *dev)
if (i915->oa_pmu.pmu.event_init == NULL)
return;
+ if (i915->oa_pmu.async_sample_mode)
+ cancel_work_sync(&i915->oa_pmu.work_timer);
+
unregister_sysctl_table(i915->oa_pmu.sysctl_header);
perf_pmu_unregister(&i915->oa_pmu.pmu);
--
1.8.5.1
More information about the Intel-gfx
mailing list