[Intel-gfx] [RFC 2/8] drm/i915: Add mechanism for forwarding the timestamp data through perf

Wed Jul 15 01:51:40 PDT 2015

From: Sourab Gupta <sourab.gupta at intel.com>

This patch adds the mechanism for forwarding the timestamp data to
userspace using the Gen PMU perf event interface.

The timestamps will be captured in a gem buffer object. The metadata
information (ctx_id right now) pertaining to snapshot is maintained in a
list, whose each node has offsets into the gem buffer object for each
snapshot captured.
In order to track whether the gpu has completed processing the node,
a field pertaining to corresponding gem request is added. The request is
expected to be referenced whenever the gpu command is submitted.

Each snapshot collected is forwarded as a separate perf sample. The perf
sample will have raw timestamp data followed by metadata information
pertaining to that sample.
While forwarding the samples, we check whether the gem request is completed
and dereference the corresponding request. The need to dereference the
request necessitates a worker here, which will be scheduled when the
hrtimer triggers.
While flushing the samples, we have to wait for the requests already
scheduled, before forwarding the samples. This wait is done in a lockless
fashion.

Signed-off-by: Sourab Gupta <sourab.gupta at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h     |  11 ++++
 drivers/gpu/drm/i915/i915_oa_perf.c | 118 +++++++++++++++++++++++++++++++++++-
 include/uapi/drm/i915_drm.h         |  10 +++
 3 files changed, 137 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7e8e77b..6984150 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1675,6 +1675,13 @@ struct i915_oa_rcs_node {
 	u32 tag;
 };
 
+struct i915_gen_pmu_node {
+	struct list_head head;
+	struct drm_i915_gem_request *req;
+	u32 offset;
+	u32 ctx_id;
+};
+
 extern const struct i915_oa_reg i915_oa_3d_mux_config_hsw[];
 extern const int i915_oa_3d_mux_config_hsw_len;
 extern const struct i915_oa_reg i915_oa_3d_b_counter_config_hsw[];
@@ -1996,7 +2003,11 @@ struct drm_i915_private {
 		struct {
 			struct drm_i915_gem_object *obj;
 			u8 *addr;
+			u32 node_size;
+			u32 node_count;
 		} buffer;
+		struct list_head node_list;
+		struct work_struct work_timer;
 	} gen_pmu;
 
 	void (*insert_profile_cmd[I915_PROFILE_MAX])
diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
index ab965b4..350b560 100644
--- a/drivers/gpu/drm/i915/i915_oa_perf.c
+++ b/drivers/gpu/drm/i915/i915_oa_perf.c
@@ -408,11 +408,108 @@ void forward_oa_rcs_snapshots_work(struct work_struct *__work)
 	}
 }
 
+int i915_gen_pmu_wait_gpu(struct drm_i915_private *dev_priv)
+{
+	struct i915_gen_pmu_node *last_entry;
+	unsigned long lock_flags;
+	int ret;
+
+	/*
+	 * Wait for the last scheduled request to complete. This would
+	 * implicitly wait for the prior submitted requests. The refcount
+	 * of the requests is not decremented here.
+	 */
+	spin_lock_irqsave(&dev_priv->gen_pmu.lock, lock_flags);
+
+	if (list_empty(&dev_priv->gen_pmu.node_list)) {
+		spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
+		return 0;
+	}
+	last_entry = list_last_entry(&dev_priv->gen_pmu.node_list,
+			struct i915_gen_pmu_node, head);
+	spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
+
+	if (last_entry && last_entry->req) {
+		ret = __i915_wait_request(last_entry->req, atomic_read(
+				&dev_priv->gpu_error.reset_counter),
+				dev_priv->mm.interruptible, NULL, NULL);
+		if (ret) {
+			DRM_ERROR("failed to wait\n");
+			return ret;
+		}
+	}
+	return 0;
+}
+
+static void forward_one_gen_pmu_sample(struct drm_i915_private *dev_priv,
+				struct i915_gen_pmu_node *node)
+{
+	struct perf_sample_data data;
+	struct perf_event *event = dev_priv->gen_pmu.exclusive_event;
+	int ts_size, snapshot_size;
+	u8 *snapshot;
+	struct drm_i915_ts_node_ctx_id *ctx_info;
+	struct perf_raw_record raw;
+
+	ts_size = sizeof(struct drm_i915_ts_data);
+	snapshot_size = ts_size + sizeof(*ctx_info);
+	snapshot = dev_priv->gen_pmu.buffer.addr + node->offset;
+
+	ctx_info = (struct drm_i915_ts_node_ctx_id *)(snapshot + ts_size);
+	ctx_info->ctx_id = node->ctx_id;
+
+	perf_sample_data_init(&data, 0, event->hw.last_period);
+
+	/* Note: the combined u32 raw->size member + raw data itself must be 8
+	 * byte aligned. (See note in init_gen_pmu_buffer for more details) */
+	raw.size = snapshot_size + 4;
+	raw.data = snapshot;
+
+	data.raw = &raw;
+
+	perf_event_overflow(event, &data, &dev_priv->gen_pmu.dummy_regs);
+}
+
+void forward_gen_pmu_snapshots_work(struct work_struct *__work)
+{
+	struct drm_i915_private *dev_priv =
+		container_of(__work, typeof(*dev_priv), gen_pmu.work_timer);
+	struct i915_gen_pmu_node *entry, *next;
+	struct drm_i915_gem_request *req;
+	unsigned long lock_flags;
+	int ret;
+
+	list_for_each_entry_safe
+		(entry, next, &dev_priv->gen_pmu.node_list, head) {
+		req = entry->req;
+		if (req && i915_gem_request_completed(req, true)) {
+			forward_one_gen_pmu_sample(dev_priv, entry);
+			ret = i915_mutex_lock_interruptible(dev_priv->dev);
+			if (ret)
+				break;
+			i915_gem_request_assign(&entry->req, NULL);
+			mutex_unlock(&dev_priv->dev->struct_mutex);
+		} else
+			break;
+
+		/*
+		 * Do we instead need to protect whole loop? If so, we would
+		 * need to *list_move_tail* to a deferred list, from where
+		 * i915 device mutex could be taken to deference the requests,
+		 * and free the node.
+		 */
+		spin_lock_irqsave(&dev_priv->gen_pmu.lock, lock_flags);
+		list_del(&entry->head);
+		spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
+		kfree(entry);
+	}
+}
+
 static void gen_pmu_flush_snapshots(struct drm_i915_private *dev_priv)
 {
 	WARN_ON(!dev_priv->gen_pmu.buffer.addr);
 
-	/* TODO: routine for forwarding snapshots to userspace */
+	schedule_work(&dev_priv->gen_pmu.work_timer);
 }
 
 static void
@@ -761,7 +858,7 @@ static int init_gen_pmu_buffer(struct perf_event *event)
 	struct drm_i915_private *dev_priv =
 		container_of(event->pmu, typeof(*dev_priv), gen_pmu.pmu);
 	struct drm_i915_gem_object *bo;
-	int ret;
+	int ret, node_size;
 
 	BUG_ON(dev_priv->gen_pmu.buffer.obj);
 
@@ -772,6 +869,15 @@ static int init_gen_pmu_buffer(struct perf_event *event)
 	dev_priv->gen_pmu.buffer.obj = bo;
 
 	dev_priv->gen_pmu.buffer.addr = vmap_oa_buffer(bo);
+	INIT_LIST_HEAD(&dev_priv->gen_pmu.node_list);
+
+	node_size = sizeof(struct drm_i915_ts_data) +
+			sizeof(struct drm_i915_ts_node_ctx_id);
+
+	/* size has to be aligned to 8 bytes (required by relevant gpu cmds) */
+	node_size = ALIGN(node_size, 8);
+	dev_priv->gen_pmu.buffer.node_size = node_size;
+	dev_priv->gen_pmu.buffer.node_count = bo->base.size / node_size;
 
 	DRM_DEBUG_DRIVER("Gen PMU Buffer initialized, vaddr = %p",
 			 dev_priv->gen_pmu.buffer.addr);
@@ -1397,6 +1503,11 @@ static int i915_gen_event_flush(struct perf_event *event)
 {
 	struct drm_i915_private *i915 =
 		container_of(event->pmu, typeof(*i915), gen_pmu.pmu);
+	int ret;
+
+	ret = i915_gen_pmu_wait_gpu(i915);
+	if (ret)
+		return ret;
 
 	gen_pmu_flush_snapshots(i915);
 	return 0;
@@ -1548,6 +1659,7 @@ void i915_gen_pmu_register(struct drm_device *dev)
 	hrtimer_init(&i915->gen_pmu.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	i915->gen_pmu.timer.function = hrtimer_sample_gen;
 
+	INIT_WORK(&i915->gen_pmu.work_timer, forward_gen_pmu_snapshots_work);
 	spin_lock_init(&i915->gen_pmu.lock);
 
 	i915->gen_pmu.pmu.capabilities  = PERF_PMU_CAP_IS_DEVICE;
@@ -1577,6 +1689,8 @@ void i915_gen_pmu_unregister(struct drm_device *dev)
 	if (i915->gen_pmu.pmu.event_init == NULL)
 		return;
 
+	cancel_work_sync(&i915->gen_pmu.work_timer);
+
 	perf_pmu_unregister(&i915->gen_pmu.pmu);
 	i915->gen_pmu.pmu.event_init = NULL;
 }
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 1084178..9c083a2 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -140,6 +140,16 @@ struct drm_i915_oa_node_tag {
 	__u32 pad;
 };
 
+struct drm_i915_ts_data {
+	__u32 ts_low;
+	__u32 ts_high;
+};
+
+struct drm_i915_ts_node_ctx_id {
+	__u32 ctx_id;
+	__u32 pad;
+};
+
 /* Each region is a minimum of 16k, and there are at most 255 of them.
  */
 #define I915_NR_TEX_REGIONS 255	/* table size 2k - maximum due to use
-- 
1.8.5.1