[Intel-gfx] [RFC 7/7] drm/i915: Add support for retrieving MMIO register values in Gen Perf PMU

sourab.gupta at intel.com sourab.gupta at intel.com
Mon Jun 22 02:55:09 PDT 2015


From: Sourab Gupta <sourab.gupta at intel.com>

This patch adds support for retrieving MMIO register values through the
Gen Perf PMU interface. Through this interface, userspace can now request
up to 8 MMIO register values to be dumped, along with the timestamp
values already being captured across batchbuffer boundaries (added
earlier in this series).
Userspace passes the addresses of up to 8 MMIO registers through the
perf attr config. The commands to dump the values of these MMIO
registers are then inserted into the ring along with the commands to
dump the timestamps.
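
For illustration, a userspace client could request these dumps roughly
as shown below. This sketch is not part of the patch: the PMU type id
lookup is elided (assumed to come from sysfs), the register offset
0x2358 is only an example, and error handling is omitted.

  #include <stdint.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/perf_event.h>
  #include <drm/i915_drm.h>	/* uapi header with this series applied */

  int open_gen_pmu_event(uint32_t pmu_type)
  {
  	struct drm_i915_gen_pmu_attr gen_attr;
  	struct perf_event_attr pe;

  	memset(&gen_attr, 0, sizeof(gen_attr));
  	gen_attr.size = sizeof(gen_attr);	/* I915_GEN_PMU_ATTR_SIZE_VER0 */
  	/* zero-terminated list of up to 8 MMIO register offsets */
  	gen_attr.mmio_list[0] = 0x2358;		/* example offset (assumed) */

  	memset(&pe, 0, sizeof(pe));
  	pe.type = pmu_type;
  	pe.size = sizeof(pe);
  	pe.sample_type = PERF_SAMPLE_RAW;
  	/* the kernel side reads attr.config as a user pointer to gen_attr */
  	pe.config = (uint64_t)(uintptr_t)&gen_attr;

  	return syscall(__NR_perf_event_open, &pe, -1 /* pid */, 0 /* cpu */,
  		       -1 /* group fd */, 0 /* flags */);
  }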

Signed-off-by: Sourab Gupta <sourab.gupta at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h     |   4 +-
 drivers/gpu/drm/i915/i915_oa_perf.c | 119 ++++++++++++++++++++++++++++++++----
 include/uapi/drm/i915_drm.h         |   9 ++-
 3 files changed, 117 insertions(+), 15 deletions(-)
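
Note (annotation, not part of the patch): each PERF_SAMPLE_RAW payload
forwarded by forward_one_gen_pmu_sample() carries a single struct
drm_i915_ts_mmio_usernode, so a consumer can decode a sample roughly as
below; the perf mmap-ring parsing that produces the raw payload is
assumed and elided here.

  #include <stdint.h>
  #include <stdio.h>
  #include <drm/i915_drm.h>	/* uapi header with this series applied */

  static void decode_node(const struct drm_i915_ts_mmio_usernode *node,
  			const struct drm_i915_gen_pmu_attr *attr)
  {
  	uint64_t ts = ((uint64_t)node->timestamp.ts_high << 32) |
  		      node->timestamp.ts_low;
  	int i;

  	printf("ts=%llu\n", (unsigned long long)ts);
  	/* mmio_list is zero-terminated, mirroring the kernel-side loop */
  	for (i = 0; i < 8 && attr->mmio_list[i]; i++)
  		printf("  reg 0x%08x = 0x%08x\n",
  		       attr->mmio_list[i], node->mmio[i]);
  }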

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a0e1d17..1f86358 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1718,9 +1718,10 @@ struct drm_i915_ts_node_info {
 	struct drm_i915_gem_request *req;
 };
 
-struct drm_i915_ts_node {
+struct drm_i915_ts_mmio_node {
 	/* ensure timestamp starts on a qword boundary */
 	struct drm_i915_ts_data timestamp;
+	__u32 mmio[8];
 	struct drm_i915_ts_node_info node_info;
 };
 #endif
@@ -2024,6 +2025,7 @@ struct drm_i915_private {
 		struct work_struct work_timer;
 		struct work_struct work_event_stop;
 		struct completion complete;
+		u32 mmio_list[8];
 	} gen_pmu;
 
 	struct list_head profile_cmd;
diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
index ed0bdc9..465e823 100644
--- a/drivers/gpu/drm/i915/i915_oa_perf.c
+++ b/drivers/gpu/drm/i915/i915_oa_perf.c
@@ -113,10 +113,10 @@ void i915_gen_insert_cmd_ts(struct intel_ringbuffer *ringbuf, u32 ctx_id,
 			dev_priv->gen_pmu.buffer.addr;
 	void *data_ptr = (u8 *)queue_hdr + queue_hdr->data_offset;
 	int data_size =	(queue_hdr->size_in_bytes - queue_hdr->data_offset);
-	u32 node_offset, timestamp_offset, addr = 0;
-	int ret;
+	u32 node_offset, timestamp_offset, mmio_offset, addr = 0;
+	int ret, i = 0;
 
-	struct drm_i915_ts_node *nodes = data_ptr;
+	struct drm_i915_ts_mmio_node *nodes = data_ptr;
 	int num_nodes = 0;
 	int index = 0;
 
@@ -124,12 +124,14 @@ void i915_gen_insert_cmd_ts(struct intel_ringbuffer *ringbuf, u32 ctx_id,
 	index = queue_hdr->node_count % num_nodes;
 
 	timestamp_offset = offsetof(struct drm_i915_ts_data, ts_low);
+	mmio_offset =
+		offsetof(struct drm_i915_ts_mmio_node, mmio);
 
 	node_offset = i915_gem_obj_ggtt_offset(dev_priv->gen_pmu.buffer.obj) +
 			queue_hdr->data_offset +
-			index * sizeof(struct drm_i915_ts_node);
+			index * sizeof(struct drm_i915_ts_mmio_node);
 	addr = node_offset +
-		offsetof(struct drm_i915_ts_node, timestamp) +
+		offsetof(struct drm_i915_ts_mmio_node, timestamp) +
 		timestamp_offset;
 
 	if (ring->id == RCS) {
@@ -158,6 +160,27 @@ void i915_gen_insert_cmd_ts(struct intel_ringbuffer *ringbuf, u32 ctx_id,
 		intel_ring_emit(ring, 0); /* imm high, must be zero */
 		intel_ring_advance(ring);
 	}
+
+	for (i = 0; i < 8; i++) {
+		if (dev_priv->gen_pmu.mmio_list[i] == 0)
+			break;
+
+		addr = node_offset + mmio_offset +
+			i * sizeof(dev_priv->gen_pmu.mmio_list[i]);
+
+		ret = intel_ring_begin(ring, 4);
+		if (ret)
+			return;
+
+		intel_ring_emit(ring,
+				MI_STORE_REGISTER_MEM(1) |
+				MI_SRM_LRM_GLOBAL_GTT);
+		intel_ring_emit(ring, dev_priv->gen_pmu.mmio_list[i]);
+		intel_ring_emit(ring, addr);
+		intel_ring_emit(ring, MI_NOOP);
+		intel_ring_advance(ring);
+	}
+
 	node_info = &nodes[index].node_info;
 	i915_gem_request_assign(&node_info->req,
 				ring->outstanding_lazy_request);
@@ -314,11 +337,11 @@ static void init_gen_pmu_buf_queue(struct drm_i915_private *dev_priv)
 }
 
 static void forward_one_gen_pmu_sample(struct drm_i915_private *dev_priv,
-				struct drm_i915_ts_node *node)
+				struct drm_i915_ts_mmio_node *node)
 {
 	struct perf_sample_data data;
 	struct perf_event *event = dev_priv->gen_pmu.exclusive_event;
-	int snapshot_size = sizeof(struct drm_i915_ts_usernode);
+	int snapshot_size = sizeof(struct drm_i915_ts_mmio_usernode);
 	struct perf_raw_record raw;
 
 	perf_sample_data_init(&data, 0, event->hw.last_period);
@@ -338,11 +361,11 @@ void i915_gen_pmu_wait_gpu(struct drm_i915_private *dev_priv)
 	struct drm_i915_ts_queue_header *hdr =
 		(struct drm_i915_ts_queue_header *)
 		dev_priv->gen_pmu.buffer.addr;
-	struct drm_i915_ts_node *first_node, *node;
+	struct drm_i915_ts_mmio_node *first_node, *node;
 	int head, tail, num_nodes, ret;
 	struct drm_i915_gem_request *req;
 
-	first_node = (struct drm_i915_ts_node *)
+	first_node = (struct drm_i915_ts_mmio_node *)
 			((char *)hdr + hdr->data_offset);
 	num_nodes = (hdr->size_in_bytes - hdr->data_offset) /
 			sizeof(*node);
@@ -375,14 +398,14 @@ void forward_gen_pmu_snapshots_work(struct work_struct *__work)
 	struct drm_i915_ts_queue_header *hdr =
 		(struct drm_i915_ts_queue_header *)
 		dev_priv->gen_pmu.buffer.addr;
-	struct drm_i915_ts_node *first_node, *node;
+	struct drm_i915_ts_mmio_node *first_node, *node;
 	int head, tail, num_nodes, ret;
 	struct drm_i915_gem_request *req;
 
 	if (dev_priv->gen_pmu.event_active == false)
 		return;
 
-	first_node = (struct drm_i915_ts_node *)
+	first_node = (struct drm_i915_ts_mmio_node *)
 			((char *)hdr + hdr->data_offset);
 	num_nodes = (hdr->size_in_bytes - hdr->data_offset) /
 			sizeof(*node);
@@ -421,11 +444,11 @@ void i915_gen_pmu_stop_work_fn(struct work_struct *__work)
 	struct drm_i915_ts_queue_header *hdr =
 		(struct drm_i915_ts_queue_header *)
 		dev_priv->gen_pmu.buffer.addr;
-	struct drm_i915_ts_node *first_node, *node;
+	struct drm_i915_ts_mmio_node *first_node, *node;
 	int head, tail, num_nodes, ret;
 	struct drm_i915_gem_request *req;
 
-	first_node = (struct drm_i915_ts_node *)
+	first_node = (struct drm_i915_ts_mmio_node *)
 			((char *)hdr + hdr->data_offset);
 	num_nodes = (hdr->size_in_bytes - hdr->data_offset) /
 			sizeof(*node);
@@ -1467,15 +1490,85 @@ static int i915_oa_event_event_idx(struct perf_event *event)
 	return 0;
 }
 
+static int i915_gen_pmu_copy_attr(struct drm_i915_gen_pmu_attr __user *uattr,
+			     struct drm_i915_gen_pmu_attr *attr)
+{
+	u32 size;
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, I915_GEN_PMU_ATTR_SIZE_VER0))
+		return -EFAULT;
+
+	/*
+	 * Zero the full structure first, so that a short copy leaves the rest zeroed.
+	 */
+	memset(attr, 0, sizeof(*attr));
+
+	ret = get_user(size, &uattr->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)	/* silly large */
+		goto err_size;
+
+	if (size < I915_GEN_PMU_ATTR_SIZE_VER0)
+		goto err_size;
+
+	/*
+	 * If we're handed a bigger struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. new
+	 * user-space does not rely on any kernel feature
+	 * extensions we don't know about yet.
+	 */
+	if (size > sizeof(*attr)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uattr + sizeof(*attr);
+		end  = (void __user *)uattr + size;
+
+		for (; addr < end; addr++) {
+			ret = get_user(val, addr);
+			if (ret)
+				return ret;
+			if (val)
+				goto err_size;
+		}
+		size = sizeof(*attr);
+	}
+
+	ret = copy_from_user(attr, uattr, size);
+	if (ret)
+		return -EFAULT;
+
+out:
+	return ret;
+
+err_size:
+	put_user(sizeof(*attr), &uattr->size);
+	ret = -E2BIG;
+	goto out;
+}
+
 static int i915_gen_event_init(struct perf_event *event)
 {
 	struct drm_i915_private *dev_priv =
 		container_of(event->pmu, typeof(*dev_priv), gen_pmu.pmu);
+	struct drm_i915_gen_pmu_attr gen_attr;
 	int ret = 0;
 
 	if (event->attr.type != event->pmu->type)
 		return -ENOENT;
 
+	ret = i915_gen_pmu_copy_attr(to_user_ptr(event->attr.config),
+				&gen_attr);
+	if (ret)
+		return ret;
+
+	memcpy(dev_priv->gen_pmu.mmio_list, gen_attr.mmio_list,
+			sizeof(dev_priv->gen_pmu.mmio_list));
+
 	/* To avoid the complexity of having to accurately filter
 	 * data and marshal to the appropriate client
 	 * we currently only allow exclusive access */
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index a7da421..8d4deec 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -80,6 +80,7 @@
 #define I915_OA_METRICS_SET_MAX			I915_OA_METRICS_SET_SAMPLER_BALANCE
 
 #define I915_OA_ATTR_SIZE_VER0		32  /* sizeof first published struct */
+#define I915_GEN_PMU_ATTR_SIZE_VER0	36  /* sizeof first published struct */
 
 typedef struct _drm_i915_oa_attr {
 	__u32 size;
@@ -97,6 +98,11 @@ typedef struct _drm_i915_oa_attr {
 	__reserved_2:31;
 } drm_i915_oa_attr_t;
 
+struct drm_i915_gen_pmu_attr {
+	__u32 size;
+	__u32 mmio_list[8];
+};
+
 /* Header for PERF_RECORD_DEVICE type events */
 typedef struct _drm_i915_oa_event_header {
 	__u32 type;
@@ -143,9 +149,10 @@ struct drm_i915_ts_data {
 	__u32 ts_high;
 };
 
-struct drm_i915_ts_usernode {
+struct drm_i915_ts_mmio_usernode {
 	/* ensure timestamp starts on a qword boundary */
 	struct drm_i915_ts_data timestamp;
+	__u32 mmio[8];
 	struct drm_i915_ts_node_footer node_info;
 };
 
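Annotation (not part of the patch): with the struct changes above, each
node in the gen_pmu buffer, and hence each raw sample, has the following
layout (the __u32 fields need no padding between them):

  offset  0: timestamp.ts_low	/* timestamp dumped at BB boundaries */
  offset  4: timestamp.ts_high
  offset  8: mmio[0]		/* MI_STORE_REGISTER_MEM of mmio_list[0] */
     ...
  offset 36: mmio[7]
  offset 40: node_info		/* footer */
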
-- 
1.8.5.1


