[Intel-gfx] [RFC 3/8] drm/i915: Add mechanism for forwarding CS based OA counter snapshots through perf

Tue Aug 4 22:52:52 PDT 2015

From: Sourab Gupta <sourab.gupta at intel.com>

This patch adds the mechanism for forwarding the CS based OA snapshots
through the perf event interface.

The OA snapshots are captured in a gem buffer object. The metadata
information (ctx global id, as of now) pertaining to each snapshot is
maintained in a list, whose nodes hold the offset into the gem buffer
object of each snapshot captured.

Each snapshot collected is forwarded as a separate perf sample. The perf
sample consists of the raw OA report followed by the metadata information
pertaining to that sample. The size of the OA report is the one specified
during event init.

In order to track whether the gpu has completed processing the node, a
field pertaining to corresponding gem request is added. The request is
expected to be referenced whenever the gpu command is submitted.

While forwarding the samples, we check whether the gem request is completed
and dereference the corresponding request. The need to dereference the
request necessitates a worker here, which will be scheduled when the
hrtimer triggers.

While flushing the samples, we have to wait for the requests already
scheduled to complete before forwarding the samples. This wait is done in
a lockless fashion.

v2: Changes here pertaining to (as suggested by Chris):
    - Forwarding functionality implemented in a separate fn. The work item
      (scheduled from hrtimer/event stop) would be calling that function.
      The event flush would directly call this forwarding fn. This meets
      the flush semantics.
    - use spin_lock instead of spin_lock_irqsave
    - Code restructuring & better nomenclature

Signed-off-by: Sourab Gupta <sourab.gupta at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h     |  11 +++
 drivers/gpu/drm/i915/i915_oa_perf.c | 145 +++++++++++++++++++++++++++++++++++-
 include/uapi/drm/i915_drm.h         |   5 ++
 3 files changed, 160 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 050bdda..87e7cf0 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1654,6 +1654,13 @@ struct i915_oa_reg {
 	u32 value;
 };
 
+struct i915_oa_rcs_node {	/* bookkeeping for one CS-captured OA snapshot */
+	struct list_head head;	/* link in oa_pmu.node_list */
+	struct drm_i915_gem_request *req;	/* request checked for completion before forwarding */
+	u32 offset;	/* byte offset of the snapshot from oa_rcs_buffer.addr */
+	u32 ctx_id;	/* context id copied into the forwarded sample's metadata */
+};
+
 extern const struct i915_oa_reg i915_oa_3d_mux_config_hsw[];
 extern const int i915_oa_3d_mux_config_hsw_len;
 extern const struct i915_oa_reg i915_oa_3d_b_counter_config_hsw[];
@@ -1954,7 +1961,11 @@ struct drm_i915_private {
 			u8 *addr;
 			int format;
 			int format_size;
+			u32 node_size;
+			u32 node_count;
 		} oa_rcs_buffer;
+		struct list_head node_list;
+		struct work_struct forward_work;
 	} oa_pmu;
 #endif
 
diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
index fd0c3a0..3948b45 100644
--- a/drivers/gpu/drm/i915/i915_oa_perf.c
+++ b/drivers/gpu/drm/i915/i915_oa_perf.c
@@ -58,6 +58,14 @@ static u32 forward_oa_snapshots(struct drm_i915_private *dev_priv,
 	u8 *snapshot;
 	u32 taken;
 
+	/*
+	 * Schedule a worker to forward the RCS based OA reports collected.
+	 * A worker is needed since forwarding requires the device mutex,
+	 * which can't be taken here because we are in atomic context.
+	 */
+	if (dev_priv->oa_pmu.multiple_ctx_mode)
+		schedule_work(&dev_priv->oa_pmu.forward_work);
+
 	head -= dev_priv->oa_pmu.oa_buffer.gtt_offset;
 	tail -= dev_priv->oa_pmu.oa_buffer.gtt_offset;
 
@@ -165,6 +173,119 @@ static void flush_oa_snapshots(struct drm_i915_private *dev_priv,
 	spin_unlock_irqrestore(&dev_priv->oa_pmu.oa_buffer.flush_lock, flags);
 }
 
+/*
+ * Wait for the last queued RCS OA request (and thus all earlier ones).
+ * Returns 0 on success or if nothing is queued, else the wait's error code.
+ */
+static int i915_oa_rcs_wait_gpu(struct drm_i915_private *dev_priv)
+{
+	struct drm_i915_gem_request *req = NULL;
+	int ret;
+
+	spin_lock(&dev_priv->oa_pmu.lock);
+	if (!list_empty(&dev_priv->oa_pmu.node_list)) {
+		req = list_last_entry(&dev_priv->oa_pmu.node_list,
+				struct i915_oa_rcs_node, head)->req;
+		/* own ref: the worker may free the node once we unlock */
+		i915_gem_request_reference(req);
+	}
+	spin_unlock(&dev_priv->oa_pmu.lock);
+	if (!req)
+		return 0;
+
+	ret = __i915_wait_request(req, atomic_read(
+			&dev_priv->gpu_error.reset_counter), true, NULL, NULL);
+	if (ret)
+		DRM_ERROR("failed to wait\n");
+
+	mutex_lock(&dev_priv->dev->struct_mutex);
+	i915_gem_request_unreference(req);
+	mutex_unlock(&dev_priv->dev->struct_mutex);
+	return ret;
+}
+
+/* Forward one CS-captured OA snapshot, plus its metadata, as a perf sample */
+static void forward_one_oa_rcs_sample(struct drm_i915_private *dev_priv,
+				struct i915_oa_rcs_node *node)
+{
+	struct perf_event *event = dev_priv->oa_pmu.exclusive_event;
+	int report_len = dev_priv->oa_pmu.oa_rcs_buffer.format_size;
+	u8 *report = dev_priv->oa_pmu.oa_rcs_buffer.addr + node->offset;
+	struct drm_i915_oa_node_ctx_id *meta;
+	struct perf_sample_data sample;
+	struct perf_raw_record record;
+	int payload_len;
+
+	/* The metadata is written directly behind the raw OA report */
+	meta = (struct drm_i915_oa_node_ctx_id *)(report + report_len);
+	meta->ctx_id = node->ctx_id;
+	payload_len = report_len + sizeof(*meta);
+
+	/*
+	 * A raw perf sample is a u32 size member followed by the raw data, and
+	 * the two together must be 8 byte aligned. The raw data (report plus
+	 * metadata) is assumed to be 8 byte aligned already, so 4 bytes are
+	 * added to the total. The size is only known at runtime, hence no
+	 * BUILD_BUG_ON to enforce this.
+	 */
+	record.size = payload_len + 4;
+	record.data = report;
+
+	perf_sample_data_init(&sample, 0, event->hw.last_period);
+	sample.raw = &record;
+
+	perf_event_overflow(event, &sample, &dev_priv->oa_pmu.dummy_regs);
+}
+
+/*
+ * Forward every snapshot whose request has completed to perf, then release
+ * its node and request reference. Called from the event flush and from the
+ * worker; it takes struct_mutex, so it must not be called in atomic context.
+ */
+static void forward_oa_rcs_snapshots(struct drm_i915_private *dev_priv)
+{
+	struct i915_oa_rcs_node *entry, *next;
+	LIST_HEAD(completed);
+
+	/*
+	 * Detach the completed nodes under the lock, so that the worker and
+	 * the flush path can never pick up (and forward) the same node twice.
+	 */
+	spin_lock(&dev_priv->oa_pmu.lock);
+	list_for_each_entry_safe
+		(entry, next, &dev_priv->oa_pmu.node_list, head) {
+		if (!i915_gem_request_completed(entry->req, true))
+			break;
+		list_move_tail(&entry->head, &completed);
+	}
+	spin_unlock(&dev_priv->oa_pmu.lock);
+
+	list_for_each_entry(entry, &completed, head)
+		forward_one_oa_rcs_sample(dev_priv, entry);
+
+	/* Uninterruptible, so nodes and request refs can never leak */
+	mutex_lock(&dev_priv->dev->struct_mutex);
+	list_for_each_entry_safe(entry, next, &completed, head) {
+		i915_gem_request_unreference(entry->req);
+		list_del(&entry->head);
+		kfree(entry);
+	}
+	mutex_unlock(&dev_priv->dev->struct_mutex);
+}
+
+/*
+ * Work fn to forward the snapshots. The forwarding of samples is triggered
+ * from hrtimer and event_stop (both atomic contexts). The forward function
+ * may sleep, hence the need for a worker.
+ */
+static void forward_oa_rcs_work_fn(struct work_struct *__work)
+{
+	struct drm_i915_private *i915;
+
+	i915 = container_of(__work, typeof(*i915), oa_pmu.forward_work);
+	forward_oa_rcs_snapshots(i915);
+}
+
 static void
 oa_rcs_buffer_destroy(struct drm_i915_private *i915)
 {
@@ -361,7 +482,7 @@ static int init_oa_rcs_buffer(struct perf_event *event)
 	struct drm_i915_private *dev_priv =
 		container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
 	struct drm_i915_gem_object *bo;
-	int ret;
+	int ret, node_size;
 
 	BUG_ON(dev_priv->oa_pmu.oa_rcs_buffer.obj);
 
@@ -373,6 +494,16 @@ static int init_oa_rcs_buffer(struct perf_event *event)
 	dev_priv->oa_pmu.oa_rcs_buffer.gtt_offset =
 				i915_gem_obj_ggtt_offset(bo);
 	dev_priv->oa_pmu.oa_rcs_buffer.addr = vmap_oa_buffer(bo);
+	INIT_LIST_HEAD(&dev_priv->oa_pmu.node_list);
+
+	node_size = dev_priv->oa_pmu.oa_rcs_buffer.format_size +
+			sizeof(struct drm_i915_oa_node_ctx_id);
+
+	/* node size has to be aligned to 64 bytes, since only 64 byte aligned
+	 * addresses can be given to OA unit for dumping OA reports */
+	node_size = ALIGN(node_size, 64);
+	dev_priv->oa_pmu.oa_rcs_buffer.node_size = node_size;
+	dev_priv->oa_pmu.oa_rcs_buffer.node_count = bo->base.size / node_size;
 
 	DRM_DEBUG_DRIVER("OA RCS Buffer initialized, vaddr = %p",
 			 dev_priv->oa_pmu.oa_rcs_buffer.addr);
@@ -846,7 +977,14 @@ static int i915_oa_event_flush(struct perf_event *event)
 	if (event->attr.sample_period) {
 		struct drm_i915_private *i915 =
 			container_of(event->pmu, typeof(*i915), oa_pmu.pmu);
+		int ret;
 
+		if (i915->oa_pmu.multiple_ctx_mode) {
+			ret = i915_oa_rcs_wait_gpu(i915);
+			if (ret)
+				return ret;
+			forward_oa_rcs_snapshots(i915);
+		}
 		flush_oa_snapshots(i915, true);
 	}
 
@@ -942,6 +1080,8 @@ void i915_oa_pmu_register(struct drm_device *dev)
 	hrtimer_init(&i915->oa_pmu.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	i915->oa_pmu.timer.function = hrtimer_sample;
 
+	INIT_WORK(&i915->oa_pmu.forward_work, forward_oa_rcs_work_fn);
+
 	spin_lock_init(&i915->oa_pmu.lock);
 
 	i915->oa_pmu.pmu.capabilities  = PERF_PMU_CAP_IS_DEVICE;
@@ -971,6 +1111,9 @@ void i915_oa_pmu_unregister(struct drm_device *dev)
 	if (i915->oa_pmu.pmu.event_init == NULL)
 		return;
 
+	if (i915->oa_pmu.multiple_ctx_mode)
+		cancel_work_sync(&i915->oa_pmu.forward_work);
+
 	unregister_sysctl_table(i915->oa_pmu.sysctl_header);
 
 	perf_pmu_unregister(&i915->oa_pmu.pmu);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index dcf7c87..e97b2fd 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -123,6 +123,11 @@ enum drm_i915_oa_event_type {
 	I915_OA_RECORD_MAX,			/* non-ABI */
 };
 
+struct drm_i915_oa_node_ctx_id {	/* metadata placed after the raw OA report in a sample */
+	__u32 ctx_id;	/* context id associated with the snapshot */
+	__u32 pad;	/* rounds the struct up to 8 bytes */
+};
+
 /* Each region is a minimum of 16k, and there are at most 255 of them.
  */
 #define I915_NR_TEX_REGIONS 255	/* table size 2k - maximum due to use
-- 
1.8.5.1